path: root/rts/gmp/mpn
author     Simon Marlow <simonmar@microsoft.com>  2006-04-07 02:05:11 +0000
committer  Simon Marlow <simonmar@microsoft.com>  2006-04-07 02:05:11 +0000
commit     0065d5ab628975892cea1ec7303f968c3338cbe1 (patch)
tree       8e2afe0ab48ee33cf95009809d67c9649573ef92 /rts/gmp/mpn
parent     28a464a75e14cece5db40f2765a29348273ff2d2 (diff)
download   haskell-0065d5ab628975892cea1ec7303f968c3338cbe1.tar.gz
Reorganisation of the source tree
Most of the other users of the fptools build system have migrated to Cabal,
and with the move to darcs we can now flatten the source tree without losing
history, so here goes.

The main change is that the ghc/ subdir is gone, and most of what it contained
is now at the top level.  The build system now makes no pretense at being
multi-project, it is just the GHC build system.

No doubt this will break many things, and there will be a period of
instability while we fix the dependencies.  A straightforward build should
work, but I haven't yet fixed binary/source distributions.  Changes to the
Building Guide will follow, too.
Diffstat (limited to 'rts/gmp/mpn')
-rw-r--r--  rts/gmp/mpn/Makefile.am  94
-rw-r--r--  rts/gmp/mpn/Makefile.in  472
-rw-r--r--  rts/gmp/mpn/README  13
-rw-r--r--  rts/gmp/mpn/a29k/add_n.s  120
-rw-r--r--  rts/gmp/mpn/a29k/addmul_1.s  113
-rw-r--r--  rts/gmp/mpn/a29k/lshift.s  93
-rw-r--r--  rts/gmp/mpn/a29k/mul_1.s  97
-rw-r--r--  rts/gmp/mpn/a29k/rshift.s  89
-rw-r--r--  rts/gmp/mpn/a29k/sub_n.s  120
-rw-r--r--  rts/gmp/mpn/a29k/submul_1.s  116
-rw-r--r--  rts/gmp/mpn/a29k/udiv.s  30
-rw-r--r--  rts/gmp/mpn/a29k/umul.s  29
-rw-r--r--  rts/gmp/mpn/alpha/README  224
-rw-r--r--  rts/gmp/mpn/alpha/add_n.asm  114
-rw-r--r--  rts/gmp/mpn/alpha/addmul_1.asm  87
-rw-r--r--  rts/gmp/mpn/alpha/cntlz.asm  68
-rw-r--r--  rts/gmp/mpn/alpha/default.m4  77
-rw-r--r--  rts/gmp/mpn/alpha/ev5/add_n.asm  143
-rw-r--r--  rts/gmp/mpn/alpha/ev5/lshift.asm  169
-rw-r--r--  rts/gmp/mpn/alpha/ev5/rshift.asm  167
-rw-r--r--  rts/gmp/mpn/alpha/ev5/sub_n.asm  143
-rw-r--r--  rts/gmp/mpn/alpha/ev6/addmul_1.asm  474
-rw-r--r--  rts/gmp/mpn/alpha/ev6/gmp-mparam.h  62
-rw-r--r--  rts/gmp/mpn/alpha/gmp-mparam.h  64
-rw-r--r--  rts/gmp/mpn/alpha/invert_limb.asm  345
-rw-r--r--  rts/gmp/mpn/alpha/lshift.asm  104
-rw-r--r--  rts/gmp/mpn/alpha/mul_1.asm  71
-rw-r--r--  rts/gmp/mpn/alpha/rshift.asm  102
-rw-r--r--  rts/gmp/mpn/alpha/sub_n.asm  114
-rw-r--r--  rts/gmp/mpn/alpha/submul_1.asm  87
-rw-r--r--  rts/gmp/mpn/alpha/udiv_qrnnd.S  151
-rw-r--r--  rts/gmp/mpn/alpha/umul.asm  39
-rw-r--r--  rts/gmp/mpn/alpha/unicos.m4  63
-rw-r--r--  rts/gmp/mpn/arm/add_n.S  77
-rw-r--r--  rts/gmp/mpn/arm/addmul_1.S  89
-rw-r--r--  rts/gmp/mpn/arm/gmp-mparam.h  34
-rw-r--r--  rts/gmp/mpn/arm/mul_1.S  81
-rw-r--r--  rts/gmp/mpn/arm/sub_n.S  79
-rw-r--r--  rts/gmp/mpn/asm-defs.m4  1182
-rw-r--r--  rts/gmp/mpn/clipper/add_n.s  48
-rw-r--r--  rts/gmp/mpn/clipper/mul_1.s  47
-rw-r--r--  rts/gmp/mpn/clipper/sub_n.s  48
-rw-r--r--  rts/gmp/mpn/cray/README  14
-rw-r--r--  rts/gmp/mpn/cray/add_n.c  96
-rw-r--r--  rts/gmp/mpn/cray/addmul_1.c  46
-rw-r--r--  rts/gmp/mpn/cray/gmp-mparam.h  27
-rw-r--r--  rts/gmp/mpn/cray/mul_1.c  44
-rw-r--r--  rts/gmp/mpn/cray/mulww.f  54
-rw-r--r--  rts/gmp/mpn/cray/mulww.s  245
-rw-r--r--  rts/gmp/mpn/cray/sub_n.c  97
-rw-r--r--  rts/gmp/mpn/cray/submul_1.c  46
-rw-r--r--  rts/gmp/mpn/generic/add_n.c  62
-rw-r--r--  rts/gmp/mpn/generic/addmul_1.c  65
-rw-r--r--  rts/gmp/mpn/generic/addsub_n.c  167
-rw-r--r--  rts/gmp/mpn/generic/bdivmod.c  120
-rw-r--r--  rts/gmp/mpn/generic/bz_divrem_n.c  153
-rw-r--r--  rts/gmp/mpn/generic/cmp.c  56
-rw-r--r--  rts/gmp/mpn/generic/diveby3.c  77
-rw-r--r--  rts/gmp/mpn/generic/divrem.c  101
-rw-r--r--  rts/gmp/mpn/generic/divrem_1.c  248
-rw-r--r--  rts/gmp/mpn/generic/divrem_2.c  151
-rw-r--r--  rts/gmp/mpn/generic/dump.c  76
-rw-r--r--  rts/gmp/mpn/generic/gcd.c  414
-rw-r--r--  rts/gmp/mpn/generic/gcd_1.c  77
-rw-r--r--  rts/gmp/mpn/generic/gcdext.c  700
-rw-r--r--  rts/gmp/mpn/generic/get_str.c  216
-rw-r--r--  rts/gmp/mpn/generic/gmp-mparam.h  27
-rw-r--r--  rts/gmp/mpn/generic/hamdist.c  94
-rw-r--r--  rts/gmp/mpn/generic/inlines.c  24
-rw-r--r--  rts/gmp/mpn/generic/jacbase.c  136
-rw-r--r--  rts/gmp/mpn/generic/lshift.c  87
-rw-r--r--  rts/gmp/mpn/generic/mod_1.c  175
-rw-r--r--  rts/gmp/mpn/generic/mod_1_rs.c  111
-rw-r--r--  rts/gmp/mpn/generic/mul.c  190
-rw-r--r--  rts/gmp/mpn/generic/mul_1.c  59
-rw-r--r--  rts/gmp/mpn/generic/mul_basecase.c  87
-rw-r--r--  rts/gmp/mpn/generic/mul_fft.c  772
-rw-r--r--  rts/gmp/mpn/generic/mul_n.c  1343
-rw-r--r--  rts/gmp/mpn/generic/perfsqr.c  123
-rw-r--r--  rts/gmp/mpn/generic/popcount.c  93
-rw-r--r--  rts/gmp/mpn/generic/pre_mod_1.c  69
-rw-r--r--  rts/gmp/mpn/generic/random.c  43
-rw-r--r--  rts/gmp/mpn/generic/random2.c  105
-rw-r--r--  rts/gmp/mpn/generic/rshift.c  88
-rw-r--r--  rts/gmp/mpn/generic/sb_divrem_mn.c  201
-rw-r--r--  rts/gmp/mpn/generic/scan0.c  62
-rw-r--r--  rts/gmp/mpn/generic/scan1.c  62
-rw-r--r--  rts/gmp/mpn/generic/set_str.c  159
-rw-r--r--  rts/gmp/mpn/generic/sqr_basecase.c  83
-rw-r--r--  rts/gmp/mpn/generic/sqrtrem.c  509
-rw-r--r--  rts/gmp/mpn/generic/sub_n.c  62
-rw-r--r--  rts/gmp/mpn/generic/submul_1.c  65
-rw-r--r--  rts/gmp/mpn/generic/tdiv_qr.c  401
-rw-r--r--  rts/gmp/mpn/generic/udiv_w_sdiv.c  131
-rw-r--r--  rts/gmp/mpn/hppa/README  91
-rw-r--r--  rts/gmp/mpn/hppa/add_n.s  58
-rw-r--r--  rts/gmp/mpn/hppa/gmp-mparam.h  63
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/addmul_1.s  102
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/mul_1.s  98
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s  75
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S  189
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s  83
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s  80
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s  76
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S  195
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/submul_1.s  111
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S  80
-rw-r--r--  rts/gmp/mpn/hppa/hppa1_1/umul.s  42
-rw-r--r--  rts/gmp/mpn/hppa/hppa2_0/add_n.s  88
-rw-r--r--  rts/gmp/mpn/hppa/hppa2_0/sub_n.s  88
-rw-r--r--  rts/gmp/mpn/hppa/lshift.s  66
-rw-r--r--  rts/gmp/mpn/hppa/rshift.s  63
-rw-r--r--  rts/gmp/mpn/hppa/sub_n.s  59
-rw-r--r--  rts/gmp/mpn/hppa/udiv_qrnnd.s  286
-rw-r--r--  rts/gmp/mpn/i960/README  9
-rw-r--r--  rts/gmp/mpn/i960/add_n.s  43
-rw-r--r--  rts/gmp/mpn/i960/addmul_1.s  48
-rw-r--r--  rts/gmp/mpn/i960/mul_1.s  45
-rw-r--r--  rts/gmp/mpn/i960/sub_n.s  43
-rw-r--r--  rts/gmp/mpn/lisp/gmpasm-mode.el  351
-rw-r--r--  rts/gmp/mpn/m68k/add_n.S  79
-rw-r--r--  rts/gmp/mpn/m68k/lshift.S  150
-rw-r--r--  rts/gmp/mpn/m68k/mc68020/addmul_1.S  83
-rw-r--r--  rts/gmp/mpn/m68k/mc68020/mul_1.S  90
-rw-r--r--  rts/gmp/mpn/m68k/mc68020/submul_1.S  83
-rw-r--r--  rts/gmp/mpn/m68k/mc68020/udiv.S  31
-rw-r--r--  rts/gmp/mpn/m68k/mc68020/umul.S  31
-rw-r--r--  rts/gmp/mpn/m68k/rshift.S  149
-rw-r--r--  rts/gmp/mpn/m68k/sub_n.S  79
-rw-r--r--  rts/gmp/mpn/m68k/syntax.h  177
-rw-r--r--  rts/gmp/mpn/m88k/add_n.s  104
-rw-r--r--  rts/gmp/mpn/m88k/mc88110/add_n.S  200
-rw-r--r--  rts/gmp/mpn/m88k/mc88110/addmul_1.s  61
-rw-r--r--  rts/gmp/mpn/m88k/mc88110/mul_1.s  59
-rw-r--r--  rts/gmp/mpn/m88k/mc88110/sub_n.S  276
-rw-r--r--  rts/gmp/mpn/m88k/mul_1.s  127
-rw-r--r--  rts/gmp/mpn/m88k/sub_n.s  106
-rw-r--r--  rts/gmp/mpn/mips2/add_n.s  120
-rw-r--r--  rts/gmp/mpn/mips2/addmul_1.s  97
-rw-r--r--  rts/gmp/mpn/mips2/lshift.s  95
-rw-r--r--  rts/gmp/mpn/mips2/mul_1.s  85
-rw-r--r--  rts/gmp/mpn/mips2/rshift.s  92
-rw-r--r--  rts/gmp/mpn/mips2/sub_n.s  120
-rw-r--r--  rts/gmp/mpn/mips2/submul_1.s  97
-rw-r--r--  rts/gmp/mpn/mips2/umul.s  30
-rw-r--r--  rts/gmp/mpn/mips3/README  23
-rw-r--r--  rts/gmp/mpn/mips3/add_n.s  120
-rw-r--r--  rts/gmp/mpn/mips3/addmul_1.s  97
-rw-r--r--  rts/gmp/mpn/mips3/gmp-mparam.h  58
-rw-r--r--  rts/gmp/mpn/mips3/lshift.s  95
-rw-r--r--  rts/gmp/mpn/mips3/mul_1.s  85
-rw-r--r--  rts/gmp/mpn/mips3/rshift.s  92
-rw-r--r--  rts/gmp/mpn/mips3/sub_n.s  120
-rw-r--r--  rts/gmp/mpn/mips3/submul_1.s  97
-rw-r--r--  rts/gmp/mpn/mp_bases.c  550
-rw-r--r--  rts/gmp/mpn/ns32k/add_n.s  46
-rw-r--r--  rts/gmp/mpn/ns32k/addmul_1.s  48
-rw-r--r--  rts/gmp/mpn/ns32k/mul_1.s  47
-rw-r--r--  rts/gmp/mpn/ns32k/sub_n.s  46
-rw-r--r--  rts/gmp/mpn/ns32k/submul_1.s  48
-rw-r--r--  rts/gmp/mpn/pa64/README  38
-rw-r--r--  rts/gmp/mpn/pa64/add_n.s  90
-rw-r--r--  rts/gmp/mpn/pa64/addmul_1.S  167
-rw-r--r--  rts/gmp/mpn/pa64/gmp-mparam.h  65
-rw-r--r--  rts/gmp/mpn/pa64/lshift.s  103
-rw-r--r--  rts/gmp/mpn/pa64/mul_1.S  158
-rw-r--r--  rts/gmp/mpn/pa64/rshift.s  100
-rw-r--r--  rts/gmp/mpn/pa64/sub_n.s  90
-rw-r--r--  rts/gmp/mpn/pa64/submul_1.S  170
-rw-r--r--  rts/gmp/mpn/pa64/udiv_qrnnd.c  111
-rw-r--r--  rts/gmp/mpn/pa64/umul_ppmm.S  74
-rw-r--r--  rts/gmp/mpn/pa64w/README  2
-rw-r--r--  rts/gmp/mpn/pa64w/add_n.s  90
-rw-r--r--  rts/gmp/mpn/pa64w/addmul_1.S  168
-rw-r--r--  rts/gmp/mpn/pa64w/gmp-mparam.h  65
-rw-r--r--  rts/gmp/mpn/pa64w/lshift.s  103
-rw-r--r--  rts/gmp/mpn/pa64w/mul_1.S  159
-rw-r--r--  rts/gmp/mpn/pa64w/rshift.s  100
-rw-r--r--  rts/gmp/mpn/pa64w/sub_n.s  90
-rw-r--r--  rts/gmp/mpn/pa64w/submul_1.S  171
-rw-r--r--  rts/gmp/mpn/pa64w/udiv_qrnnd.c  117
-rw-r--r--  rts/gmp/mpn/pa64w/umul_ppmm.S  72
-rw-r--r--  rts/gmp/mpn/power/add_n.s  79
-rw-r--r--  rts/gmp/mpn/power/addmul_1.s  122
-rw-r--r--  rts/gmp/mpn/power/lshift.s  56
-rw-r--r--  rts/gmp/mpn/power/mul_1.s  109
-rw-r--r--  rts/gmp/mpn/power/rshift.s  54
-rw-r--r--  rts/gmp/mpn/power/sdiv.s  34
-rw-r--r--  rts/gmp/mpn/power/sub_n.s  80
-rw-r--r--  rts/gmp/mpn/power/submul_1.s  127
-rw-r--r--  rts/gmp/mpn/power/umul.s  38
-rw-r--r--  rts/gmp/mpn/powerpc32/add_n.asm  61
-rw-r--r--  rts/gmp/mpn/powerpc32/addmul_1.asm  124
-rw-r--r--  rts/gmp/mpn/powerpc32/aix.m4  39
-rw-r--r--  rts/gmp/mpn/powerpc32/gmp-mparam.h  66
-rw-r--r--  rts/gmp/mpn/powerpc32/lshift.asm  145
-rw-r--r--  rts/gmp/mpn/powerpc32/mul_1.asm  86
-rw-r--r--  rts/gmp/mpn/powerpc32/regmap.m4  34
-rw-r--r--  rts/gmp/mpn/powerpc32/rshift.asm  60
-rw-r--r--  rts/gmp/mpn/powerpc32/sub_n.asm  61
-rw-r--r--  rts/gmp/mpn/powerpc32/submul_1.asm  130
-rw-r--r--  rts/gmp/mpn/powerpc32/umul.asm  32
-rw-r--r--  rts/gmp/mpn/powerpc64/README  36
-rw-r--r--  rts/gmp/mpn/powerpc64/add_n.asm  61
-rw-r--r--  rts/gmp/mpn/powerpc64/addmul_1.asm  52
-rw-r--r--  rts/gmp/mpn/powerpc64/addsub_n.asm  107
-rw-r--r--  rts/gmp/mpn/powerpc64/aix.m4  40
-rw-r--r--  rts/gmp/mpn/powerpc64/copyd.asm  45
-rw-r--r--  rts/gmp/mpn/powerpc64/copyi.asm  44
-rw-r--r--  rts/gmp/mpn/powerpc64/gmp-mparam.h  62
-rw-r--r--  rts/gmp/mpn/powerpc64/lshift.asm  159
-rw-r--r--  rts/gmp/mpn/powerpc64/mul_1.asm  49
-rw-r--r--  rts/gmp/mpn/powerpc64/rshift.asm  60
-rw-r--r--  rts/gmp/mpn/powerpc64/sub_n.asm  61
-rw-r--r--  rts/gmp/mpn/powerpc64/submul_1.asm  54
-rw-r--r--  rts/gmp/mpn/pyr/add_n.s  76
-rw-r--r--  rts/gmp/mpn/pyr/addmul_1.s  45
-rw-r--r--  rts/gmp/mpn/pyr/mul_1.s  42
-rw-r--r--  rts/gmp/mpn/pyr/sub_n.s  76
-rw-r--r--  rts/gmp/mpn/sh/add_n.s  47
-rw-r--r--  rts/gmp/mpn/sh/sh2/addmul_1.s  53
-rw-r--r--  rts/gmp/mpn/sh/sh2/mul_1.s  50
-rw-r--r--  rts/gmp/mpn/sh/sh2/submul_1.s  53
-rw-r--r--  rts/gmp/mpn/sh/sub_n.s  47
-rw-r--r--  rts/gmp/mpn/sparc32/README  36
-rw-r--r--  rts/gmp/mpn/sparc32/add_n.asm  236
-rw-r--r--  rts/gmp/mpn/sparc32/addmul_1.asm  146
-rw-r--r--  rts/gmp/mpn/sparc32/lshift.asm  97
-rw-r--r--  rts/gmp/mpn/sparc32/mul_1.asm  137
-rw-r--r--  rts/gmp/mpn/sparc32/rshift.asm  93
-rw-r--r--  rts/gmp/mpn/sparc32/sub_n.asm  326
-rw-r--r--  rts/gmp/mpn/sparc32/submul_1.asm  146
-rw-r--r--  rts/gmp/mpn/sparc32/udiv_fp.asm  158
-rw-r--r--  rts/gmp/mpn/sparc32/udiv_nfp.asm  193
-rw-r--r--  rts/gmp/mpn/sparc32/umul.asm  68
-rw-r--r--  rts/gmp/mpn/sparc32/v8/addmul_1.asm  122
-rw-r--r--  rts/gmp/mpn/sparc32/v8/mul_1.asm  103
-rw-r--r--  rts/gmp/mpn/sparc32/v8/submul_1.asm  58
-rw-r--r--  rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm  122
-rw-r--r--  rts/gmp/mpn/sparc32/v8/umul.asm  31
-rw-r--r--  rts/gmp/mpn/sparc32/v9/README  4
-rw-r--r--  rts/gmp/mpn/sparc32/v9/addmul_1.asm  288
-rw-r--r--  rts/gmp/mpn/sparc32/v9/gmp-mparam.h  69
-rw-r--r--  rts/gmp/mpn/sparc32/v9/mul_1.asm  267
-rw-r--r--  rts/gmp/mpn/sparc32/v9/submul_1.asm  291
-rw-r--r--  rts/gmp/mpn/sparc64/README  48
-rw-r--r--  rts/gmp/mpn/sparc64/add_n.asm  172
-rw-r--r--  rts/gmp/mpn/sparc64/addmul1h.asm  203
-rw-r--r--  rts/gmp/mpn/sparc64/addmul_1.asm  114
-rw-r--r--  rts/gmp/mpn/sparc64/copyi.asm  79
-rw-r--r--  rts/gmp/mpn/sparc64/gmp-mparam.h  88
-rw-r--r--  rts/gmp/mpn/sparc64/lshift.asm  97
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1.asm  113
-rw-r--r--  rts/gmp/mpn/sparc64/mul_1h.asm  183
-rw-r--r--  rts/gmp/mpn/sparc64/rshift.asm  94
-rw-r--r--  rts/gmp/mpn/sparc64/sub_n.asm  172
-rw-r--r--  rts/gmp/mpn/sparc64/submul1h.asm  204
-rw-r--r--  rts/gmp/mpn/sparc64/submul_1.asm  114
-rw-r--r--  rts/gmp/mpn/thumb/add_n.s  50
-rw-r--r--  rts/gmp/mpn/thumb/sub_n.s  50
-rw-r--r--  rts/gmp/mpn/underscore.h  26
-rw-r--r--  rts/gmp/mpn/vax/add_n.s  61
-rw-r--r--  rts/gmp/mpn/vax/addmul_1.s  126
-rw-r--r--  rts/gmp/mpn/vax/lshift.s  58
-rw-r--r--  rts/gmp/mpn/vax/mul_1.s  123
-rw-r--r--  rts/gmp/mpn/vax/rshift.s  56
-rw-r--r--  rts/gmp/mpn/vax/sub_n.s  61
-rw-r--r--  rts/gmp/mpn/vax/submul_1.s  126
-rw-r--r--  rts/gmp/mpn/x86/README  40
-rw-r--r--  rts/gmp/mpn/x86/README.family  333
-rw-r--r--  rts/gmp/mpn/x86/addsub_n.S  174
-rw-r--r--  rts/gmp/mpn/x86/aors_n.asm  187
-rw-r--r--  rts/gmp/mpn/x86/aorsmul_1.asm  134
-rw-r--r--  rts/gmp/mpn/x86/copyd.asm  80
-rw-r--r--  rts/gmp/mpn/x86/copyi.asm  79
-rw-r--r--  rts/gmp/mpn/x86/diveby3.asm  115
-rw-r--r--  rts/gmp/mpn/x86/divrem_1.asm  232
-rw-r--r--  rts/gmp/mpn/x86/k6/README  237
-rw-r--r--  rts/gmp/mpn/x86/k6/aors_n.asm  329
-rw-r--r--  rts/gmp/mpn/x86/k6/aorsmul_1.asm  372
-rw-r--r--  rts/gmp/mpn/x86/k6/cross.pl  141
-rw-r--r--  rts/gmp/mpn/x86/k6/diveby3.asm  110
-rw-r--r--  rts/gmp/mpn/x86/k6/gmp-mparam.h  97
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/copyd.asm  179
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/copyi.asm  196
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/lshift.asm  286
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/rshift.asm  285
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/com_n.asm  91
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/logops_n.asm  212
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/lshift.asm  122
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/popham.asm  238
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/rshift.asm  122
-rw-r--r--  rts/gmp/mpn/x86/k6/mul_1.asm  272
-rw-r--r--  rts/gmp/mpn/x86/k6/mul_basecase.asm  600
-rw-r--r--  rts/gmp/mpn/x86/k6/sqr_basecase.asm  672
-rw-r--r--  rts/gmp/mpn/x86/k7/README  145
-rw-r--r--  rts/gmp/mpn/x86/k7/aors_n.asm  250
-rw-r--r--  rts/gmp/mpn/x86/k7/aorsmul_1.asm  364
-rw-r--r--  rts/gmp/mpn/x86/k7/diveby3.asm  131
-rw-r--r--  rts/gmp/mpn/x86/k7/gmp-mparam.h  100
-rw-r--r--  rts/gmp/mpn/x86/k7/mmx/copyd.asm  136
-rw-r--r--  rts/gmp/mpn/x86/k7/mmx/copyi.asm  147
-rw-r--r--  rts/gmp/mpn/x86/k7/mmx/divrem_1.asm  718
-rw-r--r--  rts/gmp/mpn/x86/k7/mmx/lshift.asm  472
-rw-r--r--  rts/gmp/mpn/x86/k7/mmx/mod_1.asm  457
-rw-r--r--  rts/gmp/mpn/x86/k7/mmx/popham.asm  239
-rw-r--r--  rts/gmp/mpn/x86/k7/mmx/rshift.asm  471
-rw-r--r--  rts/gmp/mpn/x86/k7/mul_1.asm  265
-rw-r--r--  rts/gmp/mpn/x86/k7/mul_basecase.asm  593
-rw-r--r--  rts/gmp/mpn/x86/k7/sqr_basecase.asm  627
-rw-r--r--  rts/gmp/mpn/x86/lshift.asm  90
-rw-r--r--  rts/gmp/mpn/x86/mod_1.asm  141
-rw-r--r--  rts/gmp/mpn/x86/mul_1.asm  130
-rw-r--r--  rts/gmp/mpn/x86/mul_basecase.asm  209
-rw-r--r--  rts/gmp/mpn/x86/p6/README  95
-rw-r--r--  rts/gmp/mpn/x86/p6/aorsmul_1.asm  300
-rw-r--r--  rts/gmp/mpn/x86/p6/diveby3.asm  37
-rw-r--r--  rts/gmp/mpn/x86/p6/gmp-mparam.h  96
-rw-r--r--  rts/gmp/mpn/x86/p6/mmx/divrem_1.asm  677
-rw-r--r--  rts/gmp/mpn/x86/p6/mmx/mod_1.asm  444
-rw-r--r--  rts/gmp/mpn/x86/p6/mmx/popham.asm  31
-rw-r--r--  rts/gmp/mpn/x86/p6/p3mmx/popham.asm  30
-rw-r--r--  rts/gmp/mpn/x86/p6/sqr_basecase.asm  641
-rw-r--r--  rts/gmp/mpn/x86/pentium/README  77
-rw-r--r--  rts/gmp/mpn/x86/pentium/aors_n.asm  196
-rw-r--r--  rts/gmp/mpn/x86/pentium/aorsmul_1.asm  99
-rw-r--r--  rts/gmp/mpn/x86/pentium/diveby3.asm  183
-rw-r--r--  rts/gmp/mpn/x86/pentium/gmp-mparam.h  97
-rw-r--r--  rts/gmp/mpn/x86/pentium/lshift.asm  236
-rw-r--r--  rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h  97
-rw-r--r--  rts/gmp/mpn/x86/pentium/mmx/lshift.asm  455
-rw-r--r--  rts/gmp/mpn/x86/pentium/mmx/popham.asm  30
-rw-r--r--  rts/gmp/mpn/x86/pentium/mmx/rshift.asm  460
-rw-r--r--  rts/gmp/mpn/x86/pentium/mul_1.asm  79
-rw-r--r--  rts/gmp/mpn/x86/pentium/mul_basecase.asm  135
-rw-r--r--  rts/gmp/mpn/x86/pentium/rshift.asm  236
-rw-r--r--  rts/gmp/mpn/x86/pentium/sqr_basecase.asm  520
-rw-r--r--  rts/gmp/mpn/x86/rshift.asm  92
-rw-r--r--  rts/gmp/mpn/x86/udiv.asm  44
-rw-r--r--  rts/gmp/mpn/x86/umul.asm  43
-rw-r--r--  rts/gmp/mpn/x86/x86-defs.m4  713
-rw-r--r--  rts/gmp/mpn/z8000/add_n.s  53
-rw-r--r--  rts/gmp/mpn/z8000/gmp-mparam.h  27
-rw-r--r--  rts/gmp/mpn/z8000/mul_1.s  68
-rw-r--r--  rts/gmp/mpn/z8000/sub_n.s  54
-rw-r--r--  rts/gmp/mpn/z8000x/add_n.s  56
-rw-r--r--  rts/gmp/mpn/z8000x/sub_n.s  56
347 files changed, 50022 insertions, 0 deletions
diff --git a/rts/gmp/mpn/Makefile.am b/rts/gmp/mpn/Makefile.am
new file mode 100644
index 0000000000..1c49ccda25
--- /dev/null
+++ b/rts/gmp/mpn/Makefile.am
@@ -0,0 +1,94 @@
+## Process this file with automake to generate Makefile.in
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+SUBDIRS = tests
+
+CPP = @CPP@
+
+# -DOPERATION_$* tells multi-function files which function to produce.
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+GENERIC_SOURCES = mp_bases.c
+OFILES = @mpn_objects@
+
+noinst_LTLIBRARIES = libmpn.la
+libmpn_la_SOURCES = $(GENERIC_SOURCES)
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+
+TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \
+ mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \
+ sparc64 thumb vax x86 z8000 z8000x
+
+EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST)
+
+# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP?
+COMPILE_FLAGS = \
+ $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+SUFFIXES = .s .S .asm
+
+# *.s are not preprocessed at all.
+.s.o:
+ $(CCAS) $(COMPILE_FLAGS) $<
+.s.obj:
+ $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<`
+.s.lo:
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $<
+
+# *.S are preprocessed with CPP.
+.S.o:
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.S.obj:
+ $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+
+# We have to rebuild the static object file without passing -DPIC to the
+# preprocessor.  The overhead cost is one extra assembler run.  FIXME:
+# Teach libtool how to assemble with a preprocessor pass (CPP or m4).
+
+.S.lo:
+ $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# *.asm are preprocessed with m4.
+.asm.o:
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.obj:
+ $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.lo:
+ $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
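
The -DOPERATION_$* flag that these rules pass to CPP and m4 is what lets a
single source file provide several mpn entry points.  A minimal C sketch of
the convention (the file name aors_n.c, the demo_* function names, and the
limb typedefs are illustrative assumptions, not code from this tree):

    /* aors_n.c (hypothetical) -- compiled twice, once per operation:
         cc -DOPERATION_add_n -c aors_n.c -o add_n.o
         cc -DOPERATION_sub_n -c aors_n.c -o sub_n.o              */
    typedef unsigned long mp_limb_t;
    typedef long mp_size_t;

    #if defined (OPERATION_add_n)
    #define FUNC  demo_add_n
    /* w = x + y + cy; the new carry is the carry out of the add.  */
    #define STEP(w, x, y, cy) \
      ((w) = (x) + (y) + (cy), \
       (cy) = (w) < (x) || ((w) == (x) && (cy)))
    #else /* OPERATION_sub_n */
    #define FUNC  demo_sub_n
    /* w = x - y - cy; the new carry is the borrow out of the sub. */
    #define STEP(w, x, y, cy) \
      ((w) = (x) - (y) - (cy), \
       (cy) = (w) > (x) || ((w) == (x) && (cy)))
    #endif

    mp_limb_t
    FUNC (mp_limb_t *rp, const mp_limb_t *xp, const mp_limb_t *yp,
          mp_size_t n)
    {
      mp_limb_t cy = 0, w;
      mp_size_t i;
      for (i = 0; i < n; i++)
        {
          STEP (w, xp[i], yp[i], cy);
          rp[i] = w;
        }
      return cy;  /* carry/borrow out of the most significant limb */
    }

The real multi-function files in this tree (e.g. x86/aors_n.asm) apply the
same selection trick at the m4/assembly level.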
diff --git a/rts/gmp/mpn/Makefile.in b/rts/gmp/mpn/Makefile.in
new file mode 100644
index 0000000000..59ee958c92
--- /dev/null
+++ b/rts/gmp/mpn/Makefile.in
@@ -0,0 +1,472 @@
+# Makefile.in generated automatically by automake 1.4a from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_FLAG =
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+
+@SET_MAKE@
+build_alias = @build_alias@
+build_triplet = @build@
+host_alias = @host_alias@
+host_triplet = @host@
+target_alias = @target_alias@
+target_triplet = @target@
+AMDEP = @AMDEP@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+EXEEXT = @EXEEXT@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+RANLIB = @RANLIB@
+SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@
+STRIP = @STRIP@
+U = @U@
+VERSION = @VERSION@
+gmp_srclinks = @gmp_srclinks@
+install_sh = @install_sh@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+SUBDIRS =
+
+CPP = @CPP@
+
+# -DOPERATION_$* tells multi-function files which function to produce.
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+GENERIC_SOURCES = mp_bases.c
+OFILES = @mpn_objects@
+
+noinst_LTLIBRARIES = libmpn.la
+libmpn_la_SOURCES = $(GENERIC_SOURCES)
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+
+TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \
+ mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \
+ sparc64 thumb vax x86 z8000 z8000x
+
+
+EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST)
+
+# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP?
+COMPILE_FLAGS = \
+ $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+
+SUFFIXES = .s .S .asm
+subdir = mpn
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = ../config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+
+
+DEFS = @DEFS@ -I. -I$(srcdir) -I..
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+libmpn_la_LDFLAGS =
+am_libmpn_la_OBJECTS = mp_bases.lo
+libmpn_la_OBJECTS = $(am_libmpn_la_OBJECTS)
+COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CFLAGS = @CFLAGS@
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(libmpn_la_SOURCES)
+DIST_COMMON = README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+GZIP_ENV = --best
+depcomp =
+SOURCES = $(libmpn_la_SOURCES)
+OBJECTS = $(am_libmpn_la_OBJECTS)
+
+all: all-redirect
+.SUFFIXES:
+.SUFFIXES: .S .asm .c .lo .o .obj .s
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOMAKE) --gnu mpn/Makefile
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+
+mostlyclean-noinstLTLIBRARIES:
+
+clean-noinstLTLIBRARIES:
+ -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+
+distclean-noinstLTLIBRARIES:
+
+maintainer-clean-noinstLTLIBRARIES:
+
+mostlyclean-compile:
+ -rm -f *.o core *.core
+ -rm -f *.$(OBJEXT)
+
+clean-compile:
+
+distclean-compile:
+ -rm -f *.tab.c
+
+maintainer-clean-compile:
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+
+maintainer-clean-libtool:
+
+libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES)
+ $(LINK) $(libmpn_la_LDFLAGS) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS)
+.c.o:
+ $(COMPILE) -c $<
+.c.obj:
+ $(COMPILE) -c `cygpath -w $<`
+.c.lo:
+ $(LTCOMPILE) -c -o $@ $<
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+# (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+
+all-recursive install-data-recursive install-exec-recursive \
+installdirs-recursive install-recursive uninstall-recursive \
+check-recursive installcheck-recursive info-recursive dvi-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
+mostlyclean-recursive clean-recursive distclean-recursive \
+maintainer-clean-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \
+ rev="$$subdir $$rev"; \
+ if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \
+ done; \
+ test "$$dot_seen" = "no" && rev=". $$rev"; \
+ target=`echo $@ | sed s/-recursive//`; \
+ for subdir in $$rev; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done && test -z "$$fail"
+tags-recursive:
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+ done
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -f$$here/ID $$unique $(LISP)
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \
+ || etags $(ETAGS_ARGS) $$tags $$unique $(LISP)
+
+mostlyclean-tags:
+
+clean-tags:
+
+distclean-tags:
+ -rm -f TAGS ID
+
+maintainer-clean-tags:
+
+distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir)
+
+distdir: $(DISTFILES)
+ @for file in $(DISTFILES); do \
+ d=$(srcdir); \
+ if test -d $$d/$$file; then \
+ cp -pR $$d/$$file $(distdir); \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file || :; \
+ fi; \
+ done
+ for subdir in $(SUBDIRS); do \
+ if test "$$subdir" = .; then :; else \
+ test -d $(distdir)/$$subdir \
+ || mkdir $(distdir)/$$subdir \
+ || exit 1; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(top_distdir) distdir=../$(distdir)/$$subdir distdir) \
+ || exit 1; \
+ fi; \
+ done
+info-am:
+info: info-recursive
+dvi-am:
+dvi: dvi-recursive
+check-am: all-am
+check: check-recursive
+installcheck-am:
+installcheck: installcheck-recursive
+install-exec-am:
+install-exec: install-exec-recursive
+
+install-data-am:
+install-data: install-data-recursive
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-recursive
+uninstall-am:
+uninstall: uninstall-recursive
+all-am: Makefile $(LTLIBRARIES)
+all-redirect: all-recursive
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install
+installdirs: installdirs-recursive
+installdirs-am:
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -rm -f Makefile $(CONFIG_CLEAN_FILES)
+ -rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+ -rm -f Makefile.in
+mostlyclean-am: mostlyclean-noinstLTLIBRARIES mostlyclean-compile \
+ mostlyclean-libtool mostlyclean-tags \
+ mostlyclean-generic
+
+mostlyclean: mostlyclean-recursive
+
+clean-am: clean-noinstLTLIBRARIES clean-compile clean-libtool \
+ clean-tags clean-generic mostlyclean-am
+
+clean: clean-recursive
+
+distclean-am: distclean-noinstLTLIBRARIES distclean-compile \
+ distclean-libtool distclean-tags distclean-generic \
+ clean-am
+ -rm -f libtool
+
+distclean: distclean-recursive
+
+maintainer-clean-am: maintainer-clean-noinstLTLIBRARIES \
+ maintainer-clean-compile maintainer-clean-libtool \
+ maintainer-clean-tags maintainer-clean-generic \
+ distclean-am
+ @echo "This command is intended for maintainers to use;"
+ @echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-recursive
+
+.PHONY: mostlyclean-noinstLTLIBRARIES distclean-noinstLTLIBRARIES \
+clean-noinstLTLIBRARIES maintainer-clean-noinstLTLIBRARIES \
+mostlyclean-compile distclean-compile clean-compile \
+maintainer-clean-compile mostlyclean-libtool distclean-libtool \
+clean-libtool maintainer-clean-libtool install-recursive \
+uninstall-recursive install-data-recursive uninstall-data-recursive \
+install-exec-recursive uninstall-exec-recursive installdirs-recursive \
+uninstalldirs-recursive all-recursive check-recursive \
+installcheck-recursive info-recursive dvi-recursive \
+mostlyclean-recursive distclean-recursive clean-recursive \
+maintainer-clean-recursive tags tags-recursive mostlyclean-tags \
+distclean-tags clean-tags maintainer-clean-tags distdir info-am info \
+dvi-am dvi check check-am installcheck-am installcheck install-exec-am \
+install-exec install-data-am install-data install-am install \
+uninstall-am uninstall all-redirect all-am all install-strip \
+installdirs-am installdirs mostlyclean-generic distclean-generic \
+clean-generic maintainer-clean-generic clean mostlyclean distclean \
+maintainer-clean
+
+
+# *.s are not preprocessed at all.
+.s.o:
+ $(CCAS) $(COMPILE_FLAGS) $<
+.s.obj:
+ $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<`
+.s.lo:
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $<
+
+# *.S are preprocessed with CPP.
+.S.o:
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.S.obj:
+ $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+
+# We have to rebuild the static object file without passing -DPIC to the
+# preprocessor.  The overhead cost is one extra assembler run.  FIXME:
+# Teach libtool how to assemble with a preprocessor pass (CPP or m4).
+
+.S.lo:
+ $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# *.asm are preprocessed with m4.
+.asm.o:
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.obj:
+ $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.lo:
+ $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/rts/gmp/mpn/README b/rts/gmp/mpn/README
new file mode 100644
index 0000000000..7453c9d03e
--- /dev/null
+++ b/rts/gmp/mpn/README
@@ -0,0 +1,13 @@
+This directory contains all code for the mpn layer of GMP.
+
+Most subdirectories contain machine-dependent code, written in assembly or C.
+The `generic' subdirectory contains default code, used when there is no
+machine-dependent replacement for a particular machine.
+
+There is one subdirectory for each ISA family.  Note that, e.g., 32-bit SPARC
+and 64-bit SPARC are very different ISAs, and thus cannot share any code.
+
+A particular compile will use code from only one ISA subdirectory, plus the
+`generic' subdirectory.  The ISA-specific subdirectories contain hierarchies of
+directories for various architecture variants and implementations; the
+top-most level contains code that runs correctly on all variants.
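
To make the division of labor concrete, here is a small example of calling
one mpn entry point through GMP's documented low-level interface (assuming a
normally installed GMP; which subdirectory's add_n actually runs is decided
at configure time, exactly as described above):

    #include <stdio.h>
    #include <gmp.h>

    int
    main (void)
    {
      /* Two 3-limb numbers, least significant limb first.  */
      mp_limb_t a[3] = { ~(mp_limb_t) 0, ~(mp_limb_t) 0, 5 };
      mp_limb_t b[3] = { 1, 0, 0 };
      mp_limb_t r[3];

      /* r = a + b; the return value is the carry out of the top limb.  */
      mp_limb_t cy = mpn_add_n (r, a, b, 3);

      printf ("carry=%lu r[2]=%lu r[1]=%lu r[0]=%lu\n",
              (unsigned long) cy, (unsigned long) r[2],
              (unsigned long) r[1], (unsigned long) r[0]);
      return 0;
    }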
diff --git a/rts/gmp/mpn/a29k/add_n.s b/rts/gmp/mpn/a29k/add_n.s
new file mode 100644
index 0000000000..e3ee6dfa60
--- /dev/null
+++ b/rts/gmp/mpn/a29k/add_n.s
@@ -0,0 +1,120 @@
+; 29000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that don't
+; affect carry, so we need to save and restore the carry as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_add_n
+ .word 0x60000
+___gmpn_add_n:
+ srl gr117,lr5,3
+ sub gr118,gr117,1
+ jmpt gr118,Ltail
+ constn gr116,-1 ; init cy reg
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4
+ add lr4,lr4,32
+
+ subr gr116,gr116,0 ; restore carry
+ addc gr96,gr96,gr104
+ addc gr97,gr97,gr105
+ addc gr98,gr98,gr106
+ addc gr99,gr99,gr107
+ addc gr100,gr100,gr108
+ addc gr101,gr101,gr109
+ addc gr102,gr102,gr110
+ addc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+
+ and lr5,lr5,(8-1)
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ addc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ addc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ addc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ addc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ addc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ addc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: addc gr102,gr102,gr110
+
+Lstore: storem 0,0,gr96,lr2
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0
+ add gr96,gr116,1
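
A rough C model of the carry handling above may help: between blocks of addc
instructions the flag is parked in gr116 as NOT(carry) (subc gr116,gr116,gr116)
and regenerated by subr gr116,gr116,0.  This is a sketch only; it models one
limb per step rather than the 8-limb loadm/storem blocks:

    typedef unsigned int mp_limb_t;          /* the a29k limb is 32 bits */

    /* Model of ___gmpn_add_n's carry parking: park == 0 means carry set. */
    static mp_limb_t
    model_add_n (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p,
                 long n)
    {
      mp_limb_t park = ~(mp_limb_t) 0;       /* constn gr116,-1: carry clear */
      long i;
      for (i = 0; i < n; i++)
        {
          unsigned cy = (park == 0);         /* subr gr116,gr116,0: restore  */
          mp_limb_t x = s1p[i], w = x + s2p[i] + cy;
          cy = w < x || (w == x && cy);      /* carry out of the addc        */
          park = cy ? 0 : ~(mp_limb_t) 0;    /* subc gr116,gr116,gr116: save */
          rp[i] = w;
        }
      return park + 1;                       /* add gr96,gr116,1: 1 iff carry */
    }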
diff --git a/rts/gmp/mpn/a29k/addmul_1.s b/rts/gmp/mpn/a29k/addmul_1.s
new file mode 100644
index 0000000000..f51b6d7af6
--- /dev/null
+++ b/rts/gmp/mpn/a29k/addmul_1.s
@@ -0,0 +1,113 @@
+; 29000 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; add the product to a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_addmul_1
+ .word 0x60000
+___gmpn_addmul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2
+
+ add gr104,gr96,gr104
+ addc gr105,gr97,gr105
+ addc gr106,gr98,gr106
+ addc gr107,gr99,gr107
+ addc gr108,gr100,gr108
+ addc gr109,gr101,gr109
+ addc gr110,gr102,gr110
+ addc gr111,gr103,gr111
+ addc gr120,gr120,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ load 0,0,gr119,lr2
+ add gr117,gr117,gr120
+ addc gr118,gr118,0
+ add gr117,gr117,gr119
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/lshift.s b/rts/gmp/mpn/a29k/lshift.s
new file mode 100644
index 0000000000..93e1917127
--- /dev/null
+++ b/rts/gmp/mpn/a29k/lshift.s
@@ -0,0 +1,93 @@
+; 29000 __gmpn_lshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; cnt lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_lshift
+ .word 0x60000
+___gmpn_lshift:
+ sll gr116,lr4,2
+ add lr3,gr116,lr3
+ add lr2,gr116,lr2
+ sub lr3,lr3,4
+ load 0,0,gr119,lr3
+
+ subr gr116,lr5,32
+ srl gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chunk count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail
+ mtsr fc,lr5
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: sub lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3
+
+ extract gr109,gr119,gr107
+ extract gr108,gr107,gr106
+ extract gr107,gr106,gr105
+ extract gr106,gr105,gr104
+ extract gr105,gr104,gr103
+ extract gr104,gr103,gr102
+ extract gr103,gr102,gr101
+ extract gr102,gr101,gr100
+
+ sub lr2,lr2,32
+ mtsrim cr,(8-1)
+ storem 0,0,gr102,lr2
+ jmpfdec gr117,Loop
+ or gr119,gr100,0
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1)
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: sub lr3,lr3,4
+ load 0,0,gr116,lr3
+ extract gr117,gr119,gr116
+ sub lr2,lr2,4
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ or gr119,gr116,0
+
+Lend: extract gr117,gr119,0
+ sub lr2,lr2,4
+ jmpi lr0
+ store 0,0,gr117,lr2
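
The extract instructions above are funnel shifts: each result limb combines
bits from two adjacent source limbs, walking from the most significant end
down.  A C sketch of the same computation (assuming 32-bit limbs and
1 <= cnt <= 31; illustrative, not a line-for-line translation):

    typedef unsigned int mp_limb_t;          /* the a29k limb is 32 bits */

    static mp_limb_t
    model_lshift (mp_limb_t *rp, const mp_limb_t *sp, long n, unsigned cnt)
    {
      mp_limb_t ret = sp[n - 1] >> (32 - cnt);  /* bits shifted out (gr96) */
      long i;
      for (i = n - 1; i > 0; i--)               /* high-to-low, like the   */
        rp[i] = (sp[i] << cnt)                  /* extract funnel shift    */
                | (sp[i - 1] >> (32 - cnt));
      rp[0] = sp[0] << cnt;
      return ret;
    }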
diff --git a/rts/gmp/mpn/a29k/mul_1.s b/rts/gmp/mpn/a29k/mul_1.s
new file mode 100644
index 0000000000..6bcf7ce0cf
--- /dev/null
+++ b/rts/gmp/mpn/a29k/mul_1.s
@@ -0,0 +1,97 @@
+; 29000 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_mul_1
+ .word 0x60000
+___gmpn_mul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ add gr117,gr117,gr120
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/rshift.s b/rts/gmp/mpn/a29k/rshift.s
new file mode 100644
index 0000000000..ea163bff2b
--- /dev/null
+++ b/rts/gmp/mpn/a29k/rshift.s
@@ -0,0 +1,89 @@
+; 29000 __gmpn_rshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; cnt lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_rshift
+ .word 0x60000
+___gmpn_rshift:
+ load 0,0,gr119,lr3
+ add lr3,lr3,4
+
+ subr gr116,lr5,32
+ sll gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chunk count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail
+ mtsr fc,gr116
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3
+ add lr3,lr3,32
+
+ extract gr98,gr100,gr119
+ extract gr99,gr101,gr100
+ extract gr100,gr102,gr101
+ extract gr101,gr103,gr102
+ extract gr102,gr104,gr103
+ extract gr103,gr105,gr104
+ extract gr104,gr106,gr105
+ extract gr105,gr107,gr106
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr98,lr2
+ add lr2,lr2,32
+ jmpfdec gr117,Loop
+ or gr119,gr107,0
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1)
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: load 0,0,gr100,lr3
+ add lr3,lr3,4
+ extract gr117,gr100,gr119
+ store 0,0,gr117,lr2
+ add lr2,lr2,4
+ jmpfdec lr4,Loop2
+ or gr119,gr100,0
+
+Lend: srl gr117,gr119,lr5
+ jmpi lr0
+ store 0,0,gr117,lr2
diff --git a/rts/gmp/mpn/a29k/sub_n.s b/rts/gmp/mpn/a29k/sub_n.s
new file mode 100644
index 0000000000..c6b64c5bee
--- /dev/null
+++ b/rts/gmp/mpn/a29k/sub_n.s
@@ -0,0 +1,120 @@
+; 29000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that don't
+; affect carry, so we need to save and restore the carry as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_sub_n
+ .word 0x60000
+___gmpn_sub_n:
+ srl gr117,lr5,3
+ sub gr118,gr117,1
+ jmpt gr118,Ltail
+ constn gr116,-1 ; init cy reg
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4
+ add lr4,lr4,32
+
+ subr gr116,gr116,0 ; restore carry
+ subc gr96,gr96,gr104
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+
+ and lr5,lr5,(8-1)
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ subc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ subc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ subc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ subc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ subc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ subc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: subc gr102,gr102,gr110
+
+Lstore: storem 0,0,gr96,lr2
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0
+ add gr96,gr116,1
diff --git a/rts/gmp/mpn/a29k/submul_1.s b/rts/gmp/mpn/a29k/submul_1.s
new file mode 100644
index 0000000000..ef97d8d4e5
--- /dev/null
+++ b/rts/gmp/mpn/a29k/submul_1.s
@@ -0,0 +1,116 @@
+; 29000 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+; subtract the product from a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_submul_1
+ .word 0x60000
+___gmpn_submul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2
+
+ sub gr96,gr96,gr104
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+
+ add gr104,gr103,gr111 ; invert carry from previous sub
+ addc gr120,gr120,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ load 0,0,gr119,lr2
+ add gr117,gr117,gr120
+ addc gr118,gr118,0
+ sub gr119,gr119,gr117
+ add gr104,gr119,gr117 ; invert carry from previous sub
+ store 0,0,gr119,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
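
The cylimb register gr120 above accumulates both the high halves of the
products and the propagated borrow.  A reference model in C (a sketch only;
32-bit limbs are assumed, and a 64-bit intermediate stands in for the
multiplu/multmu pair):

    typedef unsigned int mp_limb_t;            /* 32-bit a29k limb  */
    typedef unsigned long long mp_double_t;    /* holds a full 64-bit product */

    static mp_limb_t
    model_submul_1 (mp_limb_t *rp, const mp_limb_t *s1p, long n,
                    mp_limb_t s2_limb)
    {
      mp_limb_t cy = 0;                        /* cylimb, like gr120 */
      long i;
      for (i = 0; i < n; i++)
        {
          mp_double_t p = (mp_double_t) s1p[i] * s2_limb + cy;
          mp_limb_t plo = (mp_limb_t) p;         /* multiplu result  */
          mp_limb_t phi = (mp_limb_t) (p >> 32); /* multmu result    */
          mp_limb_t r = rp[i];
          rp[i] = r - plo;
          /* the `invert carry' trick above: a borrow out of r - plo
             simply bumps the amount carried into the next limb */
          cy = phi + (r < plo);
        }
      return cy;
    }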
diff --git a/rts/gmp/mpn/a29k/udiv.s b/rts/gmp/mpn/a29k/udiv.s
new file mode 100644
index 0000000000..fdd53a9a88
--- /dev/null
+++ b/rts/gmp/mpn/a29k/udiv.s
@@ -0,0 +1,30 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___udiv_qrnnd
+ .word 0x60000
+___udiv_qrnnd:
+ mtsr q,lr3
+ dividu gr96,lr4,lr5
+ mfsr gr116,q
+ jmpi lr0
+ store 0,0,gr116,lr2
diff --git a/rts/gmp/mpn/a29k/umul.s b/rts/gmp/mpn/a29k/umul.s
new file mode 100644
index 0000000000..7741981167
--- /dev/null
+++ b/rts/gmp/mpn/a29k/umul.s
@@ -0,0 +1,29 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___umul_ppmm
+ .word 0x50000
+___umul_ppmm:
+ multiplu gr116,lr3,lr4
+ multmu gr96,lr3,lr4
+ jmpi lr0
+ store 0,0,gr116,lr2
diff --git a/rts/gmp/mpn/alpha/README b/rts/gmp/mpn/alpha/README
new file mode 100644
index 0000000000..744260c7c5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/README
@@ -0,0 +1,224 @@
+This directory contains mpn functions optimized for DEC Alpha processors.
+
+ALPHA ASSEMBLY RULES AND REGULATIONS
+
+The `.prologue N' pseudo op marks the end of the instructions that need
+special handling for unwinding.  It also says whether $27 is really
+needed for computing the gp. The `.mask M' pseudo op says which
+registers are saved on the stack, and at what offset in the frame.
+
+Cray code is very very different...
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+EV4
+
+1. This chip has very limited store bandwidth. The on-chip L1 cache is
+   write-through, and a cache line is transferred from the store buffer to
+   the off-chip L2 in as many as 15 cycles on most systems.  This delay hurts
+ mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift.
+
+2. Pairing is possible between memory instructions and integer arithmetic
+ instructions.
+
+3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of
+ these cycles are pipelined. Thus, multiply instructions can be issued at
+ a rate of one each 21st cycle.
+
+EV5
+
+1. The memory bandwidth of this chip seems excellent, both for loads and
+ stores. Even when the working set is larger than the on-chip L1 and L2
+   caches, the performance remains almost unaffected.
+
+2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle.
+ umulh has a measured latency of 14 cycles and an issue rate of 1 each
+ 10th cycle. But the exact timing is somewhat confusing.
+
+3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12
+ are memory operations. This will take at least
+ ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles
+ We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data
+ cache cycles, which should be completely hidden in the 19 issue cycles.
+ The computation is inherently serial, with these dependencies:
+
+ ldq ldq
+ \ /\
+ (or) addq |
+ |\ / \ |
+ | addq cmpult
+ \ | |
+ cmpult |
+ \ /
+ or
+
+ I.e., 3 operations are needed between carry-in and carry-out, making 12
+ cycles the absolute minimum for the 4 limbs. We could replace the `or'
+   with a cmoveq/cmovne, which could issue one cycle earlier than the `or',
+   but that might waste a cycle on EV4.  The total depth remains unaffected,
+ since cmov has a latency of 2 cycles.
+
+ addq
+ / \
+ addq cmpult
+ | \
+ cmpult -> cmovne
+
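+In C, the recurrence those operations implement looks like this (an
+editor's sketch of the scheme above, not code taken from GMP; the
+function name is a placeholder):
+
+   unsigned long add_limbs (unsigned long a, unsigned long b,
+                            unsigned long *cy)   /* *cy is 0 or 1 */
+   {
+     unsigned long s = a + b;       /* addq                        */
+     unsigned long c1 = s < a;      /* cmpult: carry of a + b      */
+     s += *cy;                      /* addq: add carry-in          */
+     unsigned long c2 = s < *cy;    /* cmpult: carry of that add   */
+     *cy = c1 | c2;                 /* or: at most one can be set  */
+     return s;
+   }
+
+The second addq, the cmpult, and the or are the 3 operations on the
+carry path.
+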
+Montgomery has a slightly different way of computing carry that requires one
+less instruction, but has depth 4 (instead of the current 3). Since the
+code is currently instruction issue bound, Montgomery's idea should save us
+1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25
+cycles/limb. Unfortunately, this method will not be good for the EV6.
+
+EV6
+
+Here we have a really parallel pipeline, capable of issuing up to 4 integer
+instructions per cycle. One integer multiply instruction can issue each
+cycle. To get optimal speed, we need to pretend we are vectorizing the code,
+i.e., minimize the iterative dependencies.
+
+There are two dependencies to watch out for. 1) Address arithmetic
+dependencies, and 2) carry propagation dependencies.
+
+We can avoid serializing due to address arithmetic by unrolling the loop, so
+that addresses don't depend heavily on an index variable. Avoiding
+serializing because of carry propagation is trickier; the ultimate performance
+of the code will be determined by the number of latency cycles it takes from
+accepting carry-in at a vector point until we can generate carry-out.
+
+Most integer instructions can execute in either the L0, U0, L1, or U1
+pipelines. Shifts only execute in U0 and U1, and multiply only in U1.
+
+CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but
+they execute efficiently.  However, CMOV splits the mapping process (see pg
+2-26 in cmpwrgd.pdf), suggesting that CMOV should always be placed as the
+last instruction of an aligned 4-instruction block (?).
+
+Perhaps the most important issue is the latency between the L0/U0 and L1/U1
+clusters; a result obtained on either cluster has an extra cycle of latency
+for consumers in the opposite cluster. Because of the dynamic nature of the
+implementation, it is hard to predict where an instruction will execute.
+
+The shift loops need (per limb):
+ 1 load (Lx pipes)
+ 1 store (Lx pipes)
+ 2 shift (Ux pipes)
+ 1 iaddlog (Lx pipes, Ux pipes)
+Obviously, since the pipes are so evenly loaded, we should get 4 insn/cycle,
+or 1.25 cycles/limb.
+
+For mpn_add_n, we currently have
+ 2 load (Lx pipes)
+ 1 store (Lx pipes)
+ 5 iaddlog (Lx pipes, Ux pipes)
+
+Again, we have a perfect balance and will be limited by carry propagation
+delays, currently three cycles.  The superoptimizer indicates that there
+might be sequences that--using a final cmov--have a carry propagation delay
+of just two.  Montgomery's subtraction sequence could perhaps be used, by
+complementing some operands.  All in all, we should get down to 2 cycles
+without much trouble.
+
+For mpn_mul_1, we could do, just like for mpn_add_n:
+ not newlo,notnewlo
+ addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout
+ addq cyout,newhi,cylimb
+and get 2-cycle carry propagation. The instructions needed will be
+ 1 ld (Lx pipes)
+ 1 st (Lx pipes)
+ 2 mul (U1 pipe)
+ 4 iaddlog (Lx pipes, Ux pipes)
+issue1: addq not mul ld
+issue2: cmpult addq mul st
+Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb!
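+
+The trick works because cylimb + newlo wraps exactly when cylimb > ~newlo
+(writing ~ for bitwise complement), so the carry can be derived from the
+complement without waiting for the sum.  As a C sketch of one step (an
+editor's illustration, not code from GMP; the names are placeholders):
+
+   /* one limb of mul_1: returns the low limb, updates the carry limb */
+   unsigned long mul1_step (unsigned long newlo, unsigned long newhi,
+                            unsigned long *cylimb)
+   {
+     unsigned long lo = *cylimb + newlo;   /* addq                      */
+     unsigned long cy = *cylimb > ~newlo;  /* carry of the addq above,  */
+                                           /* computed from ~newlo      */
+     *cylimb = newhi + cy;                 /* addq                      */
+     return lo;
+   }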
+
+Last, we have mpn_addmul_1. Almost certainly, we will get down to 3
+cycles/limb, which would be absolutely awesome.
+
+Old, perhaps obsolete addmul_1 dependency diagram (needs a 175-column-wide screen):
+
+ i
+ s
+ s i
+ u n
+ e s
+ d t
+ r
+ i u
+l n c
+i s t
+v t i
+e r o
+ u n
+v c
+a t t
+l i y
+u o p
+e n e
+s s s
+ issue
+ in
+ cycle
+ -1 ldq
+ / \
+ 0 | \
+ | \
+ 1 | |
+ | |
+ 2 | | ldq
+ | | / \
+ 3 | mulq | \
+ | \ | \
+ 4 umulh \ | |
+ | | | |
+ 5 | | | | ldq
+ | | | | / \
+ 4calm 6 | | ldq | mulq | \
+ | | / | \ | \
+ 4casm 7 | | / umulh \ | |
+6 | || | | | |
+ 3aal 8 | || | | | | ldq
+7 | || | | | | / \
+ 4calm 9 | || | | ldq | mulq | \
+9 | || | | / | \ | \
+ 4casm 10 | || | | / umulh \ | |
+9 | || | || | | | |
+ 3aal 11 | addq | || | | | | ldq
+9 | // \ | || | | | | / \
+ 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 13 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 14 addq | addq | || | | | | ldq
+10 \ | // \ | || | | | | / \
+ 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 16 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 17 addq | addq | || | | | |
+10 \ | // \ | || | | | |
+ 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq
+13 \ / // \ | || | | / | \
+ 4casm 19 addq cmpult stq | || | | / umulh \
+11 \ / | || | || | |
+ 3aal 20 addq | addq | || | |
+10 \ | // \ | || | |
+ 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq
+ \ / // \ | || | | /
+ 22 addq cmpult stq | || | | /
+ \ / | || | ||
+ 23 addq | addq | ||
+ \ | // \ | ||
+ 24 cy ----> \ cmpult addq<-cy | ||
+ \ / // \ | ||
+ 25 addq cmpult stq | ||
+ \ / | ||
+ 26 addq | addq
+ \ | // \
+ 27 cy ----> \ cmpult addq<-cy
+ \ / // \
+ 28 addq cmpult stq
+ \ /
+As many as 6 consecutive points will be under execution simultaneously, or if we addq
+schedule loads even further away, maybe 7 or 8. But the number of live quantities \
+is reasonable, and can easily be satisfied. cy ---->
diff --git a/rts/gmp/mpn/alpha/add_n.asm b/rts/gmp/mpn/alpha/add_n.asm
new file mode 100644
index 0000000000..08d6a9f7b8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/add_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldq r3,0(r17)
+ ldq r4,0(r18)
+
+ subq r19,1,r19
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend
+
+ ALIGN(8)
+$Loop: subq r19,4,r19
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/addmul_1.asm b/rts/gmp/mpn/alpha/addmul_1.asm
new file mode 100644
index 0000000000..4ea900be6b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/addmul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
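+dnl For reference, the operation performed is, as a C sketch using a
+dnl 128-bit type (an editor's illustration; the portable version is in
+dnl mpn/generic/addmul_1.c):
+dnl
+dnl	mp_limb_t cy = 0;
+dnl	mp_size_t i;
+dnl	for (i = 0; i < size; i++)
+dnl	  {
+dnl	    unsigned __int128 t = (unsigned __int128) s1_ptr[i] * s2_limb
+dnl	                          + res_ptr[i] + cy;
+dnl	    res_ptr[i] = (mp_limb_t) t;        /* low limb  */
+dnl	    cy = (mp_limb_t) (t >> 64);        /* high limb */
+dnl	  }
+dnl	return cy;                             /* function value */
+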
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ addq r5,r3,r3
+ cmpult r3,r5,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/cntlz.asm b/rts/gmp/mpn/alpha/cntlz.asm
new file mode 100644
index 0000000000..febb3b70d9
--- /dev/null
+++ b/rts/gmp/mpn/alpha/cntlz.asm
@@ -0,0 +1,68 @@
+dnl Alpha auxiliary for longlong.h's count_leading_zeros
+
+dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl DISCUSSION:
+
+dnl Other methods have been tried, and using a 128-entry table actually trims
+dnl about 10% of the execution time (on a 21164) when the table is in the L1
+dnl cache. But under non-benchmarking conditions, the table will hardly be in
+dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables
+dnl are also possible, but they require many more instructions than the current
+dnl code. (But for count_trailing_zeros, such tricks are beneficial.)
+dnl Finally, converting to floating-point and extracting the exponent is much
+dnl slower.
+
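+dnl The binary search below corresponds to this C sketch (an editor's
+dnl illustration, not code from GMP; the cmov pairs below implement the
+dnl if-bodies branch-free):
+dnl
+dnl	int clz (unsigned long x)     /* x must be non-zero */
+dnl	{
+dnl	  unsigned long t;
+dnl	  int n = 63;
+dnl	  if ((t = x >> 32) != 0) { x = t; n = 31; }
+dnl	  if ((t = x >> 16) != 0) { x = t; n -= 16; }
+dnl	  if ((t = x >> 8) != 0)  { x = t; n -= 8; }
+dnl	  if ((t = x >> 4) != 0)  { x = t; n -= 4; }
+dnl	  if ((t = x >> 2) != 0)  { x = t; n -= 2; }
+dnl	  return n - (int) (x >> 1);  /* final bit subtracted directly */
+dnl	}
+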
+ASM_START()
+PROLOGUE(MPN(count_leading_zeros))
+ bis r31,63,r0 C initialize partial result count
+
+ srl r16,32,r1 C shift down 32 steps -> r1
+ cmovne r1,r1,r16 C select r1 if non-zero
+ cmovne r1,31,r0 C if r1 is nonzero choose smaller count
+
+ srl r16,16,r1 C shift down 16 steps -> r1
+ subq r0,16,r2 C generate new partial result count
+ cmovne r1,r1,r16 C choose new r1 if non-zero
+ cmovne r1,r2,r0 C choose new count if r1 was non-zero
+
+ srl r16,8,r1
+ subq r0,8,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,4,r1
+ subq r0,4,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,2,r1
+ subq r0,2,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,1,r1 C extract bit 1
+ subq r0,r1,r0 C subtract it from partial result
+
+ ret r31,(r26),1
+EPILOGUE(MPN(count_leading_zeros))
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/default.m4 b/rts/gmp/mpn/alpha/default.m4
new file mode 100644
index 0000000000..5f4c48dc73
--- /dev/null
+++ b/rts/gmp/mpn/alpha/default.m4
@@ -0,0 +1,77 @@
+divert(-1)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+define(`ASM_START',
+ `
+ .set noreorder
+ .set noat')
+
+define(`X',`0x$1')
+define(`FLOAT64',
+ `
+ .align 3
+$1: .t_floating $2')
+
+define(`PROLOGUE',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ .frame r30,0,r26
+ .prologue 0')
+
+define(`PROLOGUE_GP',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ ldgp r29,0(r27)
+ .frame r30,0,r26
+ .prologue 1')
+
+define(`EPILOGUE',
+ `
+ .end $1')
+
+dnl Map register names r0, r1, etc, to `$0', `$1', etc.
+dnl This is needed on all systems but Unicos
+forloop(i,0,31,
+`define(`r'i,``$''i)'
+)
+forloop(i,0,31,
+`define(`f'i,``$f''i)'
+)
+
+define(`DATASTART',
+ `dnl
+ DATA
+$1:')
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+divert
diff --git a/rts/gmp/mpn/alpha/ev5/add_n.asm b/rts/gmp/mpn/alpha/ev5/add_n.asm
new file mode 100644
index 0000000000..716d6404ae
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/add_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ addq r0,r4,r20 C 1st main add
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r20,r0,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ addq r1,r5,r28 C 2nd main add
+ addq r18,32,r18 C update s2_ptr
+ addq r28,r25,r21 C 2nd carry add
+ cmpult r28,r5,r8 C compute cy from last add
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r21,r28,r25 C compute cy from last add
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two adds
+ ldq r1,8(r18)
+ addq r2,r6,r28 C 3rd main add
+ ldq r4,0(r17)
+ addq r28,r25,r22 C 3rd carry add
+ ldq r5,8(r17)
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18)
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17)
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ addq r1,r5,r28 C 2nd main add
+ stq r23,-8(r16)
+ addq r25,r28,r21 C 2nd carry add
+ addq r18,32,r18 C update s2_ptr
+ cmpult r28,r5,r8 C compute cy from last add
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r21,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r2,r6,r28 C 3rd main add
+ addq r28,r25,r22 C 3rd carry add
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: addq r0,r4,r28 C main add
+ ldq r0,8(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r4,8(r17)
+ addq r28,r25,r20 C carry add
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r20,r28,r25 C compute cy from last add
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: addq r0,r4,r28 C main add
+ addq r28,r25,r20 C carry add
+ cmpult r28,r4,r8 C compute cy from last add
+ cmpult r20,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/lshift.asm b/rts/gmp/mpn/alpha/ev5/lshift.asm
new file mode 100644
index 0000000000..cb181dda66
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/lshift.asm
@@ -0,0 +1,169 @@
+dnl Alpha EV5 __gmpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r31,r19,r20
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ srl r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,-16(r17)
+ subq r16,8,r16
+ sll r4,r19,r5
+ subq r17,8,r17
+ subq r28,1,r28
+ srl r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r28,$Loop0
+
+$L0: sll r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,-16(r17)
+ subq r18,4,r18
+ ldq r2,-24(r17)
+ ldq r3,-32(r17)
+ ldq r4,-40(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ ldq r1,-48(r17)
+ sll r2,r19,r22
+ ldq r2,-56(r17)
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ ldq r3,-64(r17)
+ sll r4,r19,r24
+ ldq r4,-72(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+
+ srl r1,r20,r7
+ subq r18,4,r18
+ sll r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ srl r2,r20,r8
+ ldq r1,-80(r17)
+ sll r2,r19,r22
+ ldq r2,-88(r17)
+
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+
+ srl r3,r20,r5
+ unop C ldq r31,-96(r17)
+ sll r3,r19,r23
+ subq r16,32,r16
+
+ srl r4,r20,r6
+ ldq r3,-96(r17)
+ sll r4,r19,r24
+ ldq r4,-104(r17)
+
+ subq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+ srl r3,r20,r5
+ sll r3,r19,r23
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 2/2
+ stq r7,-40(r16)
+ bis r5,r22,r5
+ stq r8,-48(r16)
+ bis r6,r23,r6
+ stq r5,-56(r16)
+ stq r6,-64(r16)
+C cool down phase 2/3
+ stq r24,-72(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 1/2
+ stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ stq r5,-24(r16)
+ stq r6,-32(r16)
+ stq r24,-40(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/rshift.asm b/rts/gmp/mpn/alpha/ev5/rshift.asm
new file mode 100644
index 0000000000..9940d83fad
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/rshift.asm
@@ -0,0 +1,167 @@
+dnl Alpha EV5 __gmpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ subq r31,r19,r20
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ sll r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,8(r17)
+ addq r16,8,r16
+ srl r4,r19,r5
+ addq r17,8,r17
+ subq r28,1,r28
+ sll r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r28,$Loop0
+
+$L0: srl r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,8(r17)
+ subq r18,4,r18
+ ldq r2,16(r17)
+ ldq r3,24(r17)
+ ldq r4,32(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ ldq r1,40(r17)
+ srl r2,r19,r22
+ ldq r2,48(r17)
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ ldq r3,56(r17)
+ srl r4,r19,r24
+ ldq r4,64(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+
+ sll r1,r20,r7
+ subq r18,4,r18
+ srl r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ sll r2,r20,r8
+ ldq r1,72(r17)
+ srl r2,r19,r22
+ ldq r2,80(r17)
+
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+
+ sll r3,r20,r5
+ unop C ldq r31,-96(r17)
+ srl r3,r19,r23
+ addq r16,32,r16
+
+ sll r4,r20,r6
+ ldq r3,88(r17)
+ srl r4,r19,r24
+ ldq r4,96(r17)
+
+ addq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+ sll r3,r20,r5
+ srl r3,r19,r23
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 2/2
+ stq r7,32(r16)
+ bis r5,r22,r5
+ stq r8,40(r16)
+ bis r6,r23,r6
+ stq r5,48(r16)
+ stq r6,56(r16)
+C cool down phase 2/3
+ stq r24,64(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 1/2
+ stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ stq r5,16(r16)
+ stq r6,24(r16)
+ stq r24,32(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/sub_n.asm b/rts/gmp/mpn/alpha/ev5/sub_n.asm
new file mode 100644
index 0000000000..5248a2aa38
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/sub_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ subq r4,r0,r20 C 1st main subtract
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r4,r0,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18)
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17)
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17)
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16)
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+	subq	r6,r2,r28	C 3rd main subtract
+	subq	r28,r25,r22	C 3rd carry subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+	subq	r7,r3,r28	C 4th main subtract
+	subq	r28,r25,r23	C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18)
+ ldq r4,8(r17)
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
new file mode 100644
index 0000000000..2f588626a5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
@@ -0,0 +1,474 @@
+dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+dnl exactly 3.625 cycles/limb on EV6...
+
+dnl This code was written in close cooperation with ev6 pipeline expert
+dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
+dnl
+dnl Register usages for unrolled loop:
+dnl 0-3 mul's
+dnl 4-7 acc's
+dnl 8-15 mul results
+dnl 20,21 carry's
+dnl 22,23 save for stores
+
+dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the
+dnl carries has to happen on one side {0} of the machine.  Note the total
+dnl use of U0, and the total use of L0 (after attending to the stores),
+dnl which is part of the reason why....
+
+dnl This is a great schedule for the d_cache, a poor schedule for the
+dnl b_cache. The lockup on U0 means that any stall can't be recovered
+dnl from.  Consider a ldq in L1.  Say that load gets stalled because it
+dnl collides with a fill from the b_cache.  On the next cycle, this load
+dnl gets priority.  It first looks at L0, and goes there.  The instruction
+dnl we intended for L0 gets to look at L1, which is NOT where we want
+dnl it.  It either stalls for 1 cycle, because it can't go in L0, or goes
+dnl there, and causes a further instruction to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles
+dnl back into the code! And, of course, put in prefetches. For the
+dnl accumulator, lds, intent to modify. For the multiplier, you might
+dnl want ldq, evict next, if you're not wanting to use it again soon. Use
+dnl 256 ahead of present pointer value. At a place where we have an mt
+dnl followed by a bookkeeping, put the bookkeeping in upper, and the
+dnl prefetch into lower.
+
+dnl Note, the usage of physical registers per cycle is smoothed out, as
+dnl much as possible.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
+dnl like not to have a ldq or stq precede a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+dnl Optimization notes:
+dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
+dnl Reserved regs: r29 r30 r31
+dnl Free caller-saves regs in unrolled code: r24 r25 r28
+dnl We should swap some of the callee-saves regs for some of the free
+dnl caller-saves regs, saving some overhead cycles.
+dnl Most importantly, we should write fast code for the 0-7 case.
+dnl The code we use there is for the 21164, and runs at 7 cycles/limb
+dnl on the 21264.  It should not be hard, if we write specialized code for
+dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+dnl need a jump table indexed by the low 3 bits of the count argument.
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpult r18, 8, r1
+ beq r1, $Large
+
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r18, $Lend0b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r18, $Lend0a C jump if size was == 2
+
+ ALIGN(8)
+$Loop0: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r18, 1, r18 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r18, $Loop0
+$Lend0a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ ret r31, (r26), 1
+$Lend0b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r0, r5, r0
+ ret r31, (r26), 1
+
+$Large:
+ lda $30, -240($30)
+ stq $9, 8($30)
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and r18, 7, r20 C count for the first loop, 0-7
+ srl r18, 3, r18 C count for unrolled loop
+ bis r31, r31, r0
+ beq r20, $Lunroll
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r20, $Lend1b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r20, $Lend1a C jump if size was == 2
+
+ ALIGN(8)
+$Loop1: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r20, 1, r20 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r20, $Loop1
+
+$Lend1a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ br r31, $Lunroll
+$Lend1b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r0, r5, r0
+
+$Lunroll:
+ lda r17, -16(r17) C L1 bookkeeping
+ lda r16, -16(r16) C L1 bookkeeping
+ bis r0, r31, r12
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldq r2, 16(r17) C L1
+ ldq r3, 24(r17) C L1
+ lda r18, -1(r18) C L1 bookkeeping
+ ldq r6, 16(r16) C L1
+ ldq r7, 24(r16) C L1
+ ldq r0, 32(r17) C L1
+ mulq r19, r2, r13 C U1
+ ldq r1, 40(r17) C L1
+ umulh r19, r2, r14 C U1
+ mulq r19, r3, r15 C U1
+ lda r17, 64(r17) C L1 bookkeeping
+ ldq r4, 32(r16) C L1
+ ldq r5, 40(r16) C L1
+ umulh r19, r3, r8 C U1
+ ldq r2, -16(r17) C L1
+ mulq r19, r0, r9 C U1
+ ldq r3, -8(r17) C L1
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ mulq r19, r1, r11 C U1
+ cmpult r6, r13, r20 C L0 lo add => carry
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+ mulq r19, r3, r15 C U1
+ addq r8, r21, r8 C U0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C U1 bookkeeping
+
+C ____ MAIN UNROLLED LOOP ____
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, 16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, 24(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -1(r18) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, 16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, 24(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 32(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 40(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r17, 64(r17) C L1 bookkeeping
+ addq r4, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 32(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 40(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, -16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, -8(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ bis r31, r31, r31 C L1 mt
+ bgt r18, $Loop C U1 bookkeeping
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+ mulq r19, r1, r11 C U1
+ addq r12, r21, r12 C U0 hi mul + carry
+ cmpult r6, r13, r20 C L0 lo add => carry
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+ bis r31, r31, r31 C L0 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+ addq r12, r21, r0 C U0 hi mul + carry
+
+ ldq $9, 8($30)
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30)
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..7ea20577f8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-08-02. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 47
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 70
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 94
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 101
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 33
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 70
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
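+
+/* These thresholds choose the multiplication algorithm by operand size,
+   roughly along these lines (an editor's sketch, not the exact dispatch,
+   which lives in the generic multiply code, e.g. mpn/generic/mul_n.c):
+
+       if (n < KARATSUBA_MUL_THRESHOLD)
+         ... schoolbook multiply ...
+       else if (n < TOOM3_MUL_THRESHOLD)
+         ... Karatsuba ...
+       else
+         ... Toom-3 ...
+
+   The _SQR_ thresholds play the same role for squaring.  */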
diff --git a/rts/gmp/mpn/alpha/gmp-mparam.h b/rts/gmp/mpn/alpha/gmp-mparam.h
new file mode 100644
index 0000000000..054ff2fe5f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/gmp-mparam.h
@@ -0,0 +1,64 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 21164 family. The 21264 will require
+ different values, since it has such quick multiplication. */
+/* Generated by tuneup.c, 2000-07-19. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 22
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 53
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 31
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 47
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 64
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 17
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 4
+#endif
diff --git a/rts/gmp/mpn/alpha/invert_limb.asm b/rts/gmp/mpn/alpha/invert_limb.asm
new file mode 100644
index 0000000000..a921b32b3f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/invert_limb.asm
@@ -0,0 +1,345 @@
+dnl Alpha mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+dnl
+dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c.
+dnl The ideas are due to Peter L. Montgomery
+dnl
+dnl The table below uses 4096 bytes. The file mentioned above has an
+dnl alternative function that doesn't require the table, but it runs 50%
+dnl slower than this.
+
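+dnl The value computed is the usual GMP reciprocal of a normalized limb
+dnl d (top bit set): floor ((B^2 - 1) / d) - B, with B = 2^64.  Since
+dnl B^2 - 1 - B*d is the two-limb number {~d, ~0}, a C sketch using a
+dnl 128-bit type is (an editor's illustration, not code from this file):
+dnl
+dnl	mp_limb_t invert_limb (mp_limb_t d)  /* d must be normalized */
+dnl	{
+dnl	  return (mp_limb_t)
+dnl	    ((((unsigned __int128) ~d << 64) | ~(mp_limb_t) 0) / d);
+dnl	}
+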
+include(`../config.m4')
+
+ASM_START()
+
+FLOAT64($C36,9223372036854775808.0) C 2^63
+
+PROLOGUE_GP(mpn_invert_limb)
+ lda r30,-16(r30)
+ addq r16,r16,r1
+ bne r1,$73
+ lda r0,-1
+ br r31,$Lend
+$73:
+ srl r16,1,r1
+ stq r1,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ lda r1,$C36
+ ldt f10,0(r1)
+ divt f10,f1,f10
+ lda r2,$invtab-4096
+ srl r16,52,r1
+ addq r1,r1,r1
+ addq r1,r2,r1
+ bic r1,6,r2
+ ldq r2,0(r2)
+ bic r1,1,r1
+ extwl r2,r1,r2
+ sll r2,48,r0
+ umulh r16,r0,r1
+ addq r16,r1,r3
+ stq r3,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ mult f1,f10,f1
+ cvttqc f1,f1
+ stt f1,0(r30)
+ ldq r4,0(r30)
+ subq r0,r4,r0
+ umulh r16,r0,r1
+ mulq r16,r0,r2
+ addq r16,r1,r3
+ bge r3,$Loop2
+$Loop1: addq r2,r16,r2
+ cmpult r2,r16,r1
+ addq r3,r1,r3
+ addq r0,1,r0
+ blt r3,$Loop1
+$Loop2: cmpult r2,r16,r1
+ subq r0,1,r0
+ subq r3,r1,r3
+ subq r2,r16,r2
+ bge r3,$Loop2
+$Lend:
+ lda r30,16(r30)
+ ret r31,(r26),1
+EPILOGUE(mpn_invert_limb)
+DATASTART(`$invtab',4)
+ .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41
+ .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46
+ .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50
+ .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d
+ .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e
+ .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483
+ .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c
+ .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8
+ .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8
+ .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb
+ .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22
+ .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d
+ .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b
+ .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad
+ .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2
+ .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a
+ .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056
+ .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95
+ .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7
+ .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d
+ .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965
+ .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1
+ .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600
+ .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452
+ .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7
+ .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100
+ .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b
+ .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9
+ .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a
+ .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e
+ .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5
+ .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f
+ .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb
+ .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a
+ .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c
+ .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111
+ .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89
+ .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03
+ .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80
+ .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff
+ .word 0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981
+ .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806
+ .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d
+ .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516
+ .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2
+ .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231
+ .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2
+ .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55
+ .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb
+ .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83
+ .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e
+ .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb
+ .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a
+ .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb
+ .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f
+ .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445
+ .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed
+ .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197
+ .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044
+ .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2
+ .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3
+ .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56
+ .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b
+ .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2
+ .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b
+ .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736
+ .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3
+ .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3
+ .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374
+ .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237
+ .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc
+ .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3
+ .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b
+ .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56
+ .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23
+ .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1
+ .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1
+ .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893
+ .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767
+ .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d
+ .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514
+ .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee
+ .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9
+ .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5
+ .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084
+ .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64
+ .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45
+ .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29
+ .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e
+ .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5
+ .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd
+ .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7
+ .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2
+ .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0
+ .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e
+ .word 0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e
+ .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370
+ .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264
+ .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158
+ .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f
+ .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46
+ .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40
+ .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a
+ .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37
+ .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34
+ .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33
+ .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934
+ .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836
+ .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739
+ .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e
+ .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544
+ .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b
+ .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354
+ .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e
+ .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169
+ .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076
+ .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84
+ .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93
+ .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4
+ .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6
+ .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9
+ .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add
+ .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3
+ .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a
+ .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822
+ .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b
+ .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656
+ .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571
+ .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e
+ .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac
+ .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb
+ .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec
+ .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d
+ .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030
+ .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54
+ .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79
+ .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f
+ .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6
+ .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee
+ .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18
+ .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42
+ .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e
+ .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a
+ .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8
+ .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6
+ .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626
+ .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557
+ .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489
+ .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc
+ .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef
+ .word 0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224
+ .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a
+ .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091
+ .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9
+ .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01
+ .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b
+ .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76
+ .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1
+ .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee
+ .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b
+ .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a
+ .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9
+ .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea
+ .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b
+ .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d
+ .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0
+ .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4
+ .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539
+ .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e
+ .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5
+ .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c
+ .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255
+ .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e
+ .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8
+ .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033
+ .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e
+ .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb
+ .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18
+ .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66
+ .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5
+ .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05
+ .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56
+ .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7
+ .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9
+ .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c
+ .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0
+ .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5
+ .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a
+ .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0
+ .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7
+ .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e
+ .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7
+ .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400
+ .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a
+ .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4
+ .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210
+ .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c
+ .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8
+ .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026
+ .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84
+ .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3
+ .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42
+ .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3
+ .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04
+ .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65
+ .word 0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8
+ .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b
+ .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f
+ .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3
+ .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958
+ .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be
+ .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824
+ .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b
+ .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3
+ .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b
+ .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4
+ .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e
+ .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498
+ .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403
+ .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f
+ .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db
+ .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247
+ .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5
+ .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123
+ .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091
+ .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001
+ .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70
+ .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1
+ .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52
+ .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3
+ .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35
+ .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8
+ .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c
+ .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f
+ .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04
+ .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79
+ .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee
+ .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965
+ .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db
+ .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853
+ .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca
+ .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743
+ .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc
+ .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635
+ .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af
+ .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a
+ .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5
+ .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420
+ .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c
+ .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319
+ .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296
+ .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214
+ .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192
+ .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111
+ .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090
+ .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010
+DATAEND()
+ASM_END()
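
Functionally, mpn_invert_limb returns the reciprocal limb used by the
division code: for a normalized divisor d (top bit set) it produces
floor((B^2 - 1)/d) - B, with B = 2^64. A small C model of that value,
assuming GCC's unsigned __int128 (the asm arrives at the same result via
the table lookup plus the two refinement loops, avoiding a hardware
divide):

    #include <stdint.h>

    /* Reciprocal limb for a normalized d (d >= 2^63):
       floor((B^2 - 1) / d) - B, where B = 2^64. */
    uint64_t
    invert_limb_model (uint64_t d)
    {
      unsigned __int128 b2m1 = ~(unsigned __int128) 0;  /* B^2 - 1 */
      unsigned __int128 q = b2m1 / d;                   /* in [B, 2B) */
      return (uint64_t) (q - ((unsigned __int128) 1 << 64));
    }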
diff --git a/rts/gmp/mpn/alpha/lshift.asm b/rts/gmp/mpn/alpha/lshift.asm
new file mode 100644
index 0000000000..87c46f6fe7
--- /dev/null
+++ b/rts/gmp/mpn/alpha/lshift.asm
@@ -0,0 +1,104 @@
+dnl Alpha mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl make it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r17,8,r17
+ subq r31,r19,r7
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ srl r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,-8(r17)
+ subq r16,8,r16
+ subq r17,8,r17
+ subq r20,1,r20
+ sll r4,r19,r5
+ srl r3,r7,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+$Loop: ldq r3,-8(r17)
+ subq r16,32,r16
+ subq r18,4,r18
+ sll r4,r19,r5
+ srl r3,r7,r6
+
+ ldq r4,-16(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,24(r16)
+ srl r4,r7,r2
+
+ ldq r3,-24(r17)
+ sll r4,r19,r5
+ bis r1,r2,r8
+ stq r8,16(r16)
+ srl r3,r7,r6
+
+ ldq r4,-32(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,8(r16)
+ srl r4,r7,r2
+
+ subq r17,32,r17
+ bis r1,r2,r8
+ stq r8,0(r16)
+
+ bgt r18,$Loop
+
+$Lend: sll r4,r19,r8
+ stq r8,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
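
Stripped of the four-way unrolling, the shift above corresponds to this C
reference (a sketch assuming 64-bit limbs and 0 < cnt < 64; like the asm,
it walks from the most significant limb down, which is why r16 and r17
are first pointed at the ends of the operands, and it returns the bits
shifted out at the top):

    #include <stdint.h>
    #include <stddef.h>

    uint64_t
    lshift_ref (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
    {
      unsigned tnc = 64 - cnt;
      uint64_t high = up[n - 1];
      uint64_t retval = high >> tnc;        /* the function result */
      for (size_t i = n - 1; i > 0; i--)
        {
          uint64_t low = up[i - 1];
          rp[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      rp[0] = high << cnt;
      return retval;
    }

mpn_rshift below is the mirror image, walking from the low limb upwards.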
diff --git a/rts/gmp/mpn/alpha/mul_1.asm b/rts/gmp/mpn/alpha/mul_1.asm
new file mode 100644
index 0000000000..46b8df34f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/mul_1.asm
@@ -0,0 +1,71 @@
+dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r4 C clear cy_limb
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ stq r3,0(r16)
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,16(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r16,8,r16 C res_ptr++
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16)
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: stq r3,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_mul_1)
+ASM_END()
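
The loop corresponds to this C reference (a sketch using GCC's unsigned
__int128 for the double-limb product; mulq and umulh in the asm deliver
the low and high halves that the 128-bit arithmetic gives here):

    #include <stdint.h>
    #include <stddef.h>

    uint64_t
    mul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v)
    {
      uint64_t cy = 0;                /* carry limb between iterations */
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v + cy;
          rp[i] = (uint64_t) p;       /* prod_low plus incoming carry */
          cy = (uint64_t) (p >> 64);  /* prod_high is the next carry */
        }
      return cy;
    }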
diff --git a/rts/gmp/mpn/alpha/rshift.asm b/rts/gmp/mpn/alpha/rshift.asm
new file mode 100644
index 0000000000..aa25eda54e
--- /dev/null
+++ b/rts/gmp/mpn/alpha/rshift.asm
@@ -0,0 +1,102 @@
+dnl Alpha mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl make it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ addq r17,8,r17
+ subq r31,r19,r7
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ sll r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,0(r17)
+ addq r16,8,r16
+ addq r17,8,r17
+ subq r20,1,r20
+ srl r4,r19,r5
+ sll r3,r7,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+$Loop: ldq r3,0(r17)
+ addq r16,32,r16
+ subq r18,4,r18
+ srl r4,r19,r5
+ sll r3,r7,r6
+
+ ldq r4,8(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-32(r16)
+ sll r4,r7,r2
+
+ ldq r3,16(r17)
+ srl r4,r19,r5
+ bis r1,r2,r8
+ stq r8,-24(r16)
+ sll r3,r7,r6
+
+ ldq r4,24(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-16(r16)
+ sll r4,r7,r2
+
+ addq r17,32,r17
+ bis r1,r2,r8
+ stq r8,-8(r16)
+
+ bgt r18,$Loop
+
+$Lend: srl r4,r19,r8
+ stq r8,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/sub_n.asm b/rts/gmp/mpn/alpha/sub_n.asm
new file mode 100644
index 0000000000..718f657141
--- /dev/null
+++ b/rts/gmp/mpn/alpha/sub_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ ldq r3,0(r17)
+ ldq r4,0(r18)
+
+ subq r19,1,r19
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend
+
+ ALIGN(8)
+$Loop: subq r19,4,r19
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
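
Alpha has no carry flag, so each limb subtraction reconstructs the borrow
with cmpult: add the incoming borrow to the subtrahend (detecting
overflow), subtract (detecting underflow), and OR the two events into the
next borrow. One iteration in C terms (a sketch with 64-bit limbs):

    #include <stdint.h>
    #include <stddef.h>

    uint64_t
    sub_n_ref (uint64_t *rp, const uint64_t *s1p, const uint64_t *s2p,
               size_t n)
    {
      uint64_t b = 0;                   /* borrow, always 0 or 1 */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = s2p[i] + b;
          uint64_t c1 = t < b;          /* cmpult: s2 + b overflowed */
          uint64_t d = s1p[i] - t;
          uint64_t c2 = s1p[i] < d;     /* cmpult: s1 - t underflowed */
          rp[i] = d;
          b = c1 | c2;                  /* bis: combine borrow events */
        }
      return b;
    }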
diff --git a/rts/gmp/mpn/alpha/submul_1.asm b/rts/gmp/mpn/alpha/submul_1.asm
new file mode 100644
index 0000000000..caec1a720b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/submul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ subq r5,r3,r3
+ cmpult r5,r3,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_submul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/udiv_qrnnd.S b/rts/gmp/mpn/alpha/udiv_qrnnd.S
new file mode 100644
index 0000000000..53814bbcb0
--- /dev/null
+++ b/rts/gmp/mpn/alpha/udiv_qrnnd.S
@@ -0,0 +1,151 @@
+ # Alpha 21064 __udiv_qrnnd
+
+ # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __gmpn_udiv_qrnnd
+ .ent __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd:
+ .frame $30,0,$26,0
+ .prologue 0
+#define cnt $2
+#define tmp $3
+#define rem_ptr $16
+#define n1 $17
+#define n0 $18
+#define d $19
+#define qb $20
+
+ ldiq cnt,16
+ blt d,.Largedivisor
+
+.Loop1: cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop1
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+.Largedivisor:
+ and n0,1,$4
+
+ srl n0,1,n0
+ sll n1,63,tmp
+ or tmp,n0,n0
+ srl n1,1,n1
+
+ and d,1,$6
+ srl d,1,$5
+ addq $5,$6,$5
+
+.Loop2: cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop2
+
+ addq n1,n1,n1
+ addq $4,n1,n1
+ bne $6,.LOdd
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+.LOdd:
+ /* q' in n0. r' in n1 */
+ addq n1,n0,n1
+ cmpult n1,n0,tmp # tmp := carry from addq
+ beq tmp,.LLp6
+ addq n0,1,n0
+ subq n1,d,n1
+.LLp6: cmpult n1,d,tmp
+ bne tmp,.LLp7
+ addq n0,1,n0
+ subq n1,d,n1
+.LLp7:
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ .end __gmpn_udiv_qrnnd
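
The routine is plain shift-and-subtract division, unrolled four quotient
bits per pass of each loop; .Largedivisor halves d (rounding up) so the
compare stays representable when d has its top bit set, and .LOdd repairs
the result for odd d. Ignoring the unrolling, the underlying step looks
like this C model (a sketch; it assumes n1 < d so the quotient fits in
one limb):

    #include <stdint.h>

    uint64_t
    udiv_qrnnd_model (uint64_t *rem, uint64_t n1, uint64_t n0, uint64_t d)
    {
      uint64_t q = 0;
      for (int i = 0; i < 64; i++)
        {
          uint64_t top = n1 >> 63;         /* bit shifted out of n1 */
          n1 = (n1 << 1) | (n0 >> 63);     /* shift n1:n0 left one bit */
          n0 <<= 1;
          q <<= 1;
          if (top || n1 >= d)              /* subtract d when it fits */
            {
              n1 -= d;
              q |= 1;
            }
        }
      *rem = n1;                           /* remainder */
      return q;                            /* quotient */
    }

(In the asm the quotient bits are shifted into n0 itself rather than a
separate register.)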
diff --git a/rts/gmp/mpn/alpha/umul.asm b/rts/gmp/mpn/alpha/umul.asm
new file mode 100644
index 0000000000..44428ed5f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/umul.asm
@@ -0,0 +1,39 @@
+dnl Currently unused.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ .set noreorder
+ .set volatile
+ .set noat
+
+.text
+ .align 3
+ .globl __umul_ppmm
+ .ent __umul_ppmm
+__umul_ppmm:
+__umul_ppmm..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ mulq $17,$18,$1
+ umulh $17,$18,$0
+ stq $1,0($16)
+ ret $31,($26),1
+ .end __umul_ppmm
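
What __umul_ppmm computes is simply the full 128-bit product of two
limbs: mulq yields the low half and umulh the high half. In C, modelled
with GCC's unsigned __int128:

    #include <stdint.h>

    /* Store the low product limb at *plo, return the high limb. */
    uint64_t
    umul_ppmm_model (uint64_t *plo, uint64_t u, uint64_t v)
    {
      unsigned __int128 p = (unsigned __int128) u * v;
      *plo = (uint64_t) p;           /* what mulq produces */
      return (uint64_t) (p >> 64);   /* what umulh produces */
    }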
diff --git a/rts/gmp/mpn/alpha/unicos.m4 b/rts/gmp/mpn/alpha/unicos.m4
new file mode 100644
index 0000000000..7ff26c090c
--- /dev/null
+++ b/rts/gmp/mpn/alpha/unicos.m4
@@ -0,0 +1,63 @@
+divert(-1)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+define(`ASM_START',
+ `.ident dummy')
+
+define(`X',`^X$1')
+define(`FLOAT64',
+ `dnl
+ .psect $1@crud,data
+$1: .t_floating $2
+ .endp')
+
+define(`PROLOGUE',
+ `dnl
+ .stack 192 ; What does this mean? Only Cray knows.
+ .psect $1@code,code,cache
+$1::')
+define(`PROLOGUE_GP', `PROLOGUE($1)')
+
+define(`EPILOGUE',
+ `dnl
+ .endp')
+
+define(`DATASTART',
+ `dnl
+ .psect $1@crud,data
+$1:')
+define(`DATAEND',
+ `dnl
+ .endp')
+
+define(`ASM_END',
+ `dnl
+ .end')
+
+define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop
+define(`cvttqc',`cvttq/c')
+
+define(`ALIGN',`') ; Unicos assembler seems to align using garbage
+
+divert
+
diff --git a/rts/gmp/mpn/arm/add_n.S b/rts/gmp/mpn/arm/add_n.S
new file mode 100644
index 0000000000..fb3f8f703b
--- /dev/null
+++ b/rts/gmp/mpn/arm/add_n.S
@@ -0,0 +1,77 @@
+@ ARM mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in
+@ a third limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define s r0
+#define a r1
+#define b r2
+#define n r3
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_add_n
+ .type __gmpn_add_n,%function
+__gmpn_add_n:
+ stmfd sp!, { r8, r9, lr }
+ movs n, n, lsr #1
+ bcc skip1
+ ldr ip, [a], #4
+ ldr lr, [b], #4
+ adds ip, ip, lr
+ str ip, [s], #4
+skip1:
+ tst n, #1
+ beq skip2
+ ldmia a!, { r8, r9 }
+ ldmia b!, { ip, lr }
+ adcs r8, r8, ip
+ adcs r9, r9, lr
+ stmia s!, { r8, r9 }
+skip2:
+ bics n, n, #1
+ beq return
+ stmfd sp!, { r4, r5, r6, r7 }
+add_n_loop:
+ ldmia a!, { r4, r5, r6, r7 }
+ ldmia b!, { r8, r9, ip, lr }
+ adcs r4, r4, r8
+ ldr r8, [s] /* Bring stuff into cache. */
+ adcs r5, r5, r9
+ adcs r6, r6, ip
+ adcs r7, r7, lr
+ stmia s!, { r4, r5, r6, r7 }
+ sub n, n, #2
+ teq n, #0
+ bne add_n_loop
+ ldmfd sp!, { r4, r5, r6, r7 }
+return:
+ adc r0, n, #0
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_add_n, end - __gmpn_add_n
diff --git a/rts/gmp/mpn/arm/addmul_1.S b/rts/gmp/mpn/arm/addmul_1.S
new file mode 100644
index 0000000000..396fff77a3
--- /dev/null
+++ b/rts/gmp/mpn/arm/addmul_1.S
@@ -0,0 +1,89 @@
+@ ARM mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a
+@ second limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define p r0
+#define a r1
+#define n r2
+#define w r3
+
+#define z r11
+
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_addmul_1
+ .type __gmpn_addmul_1,%function
+__gmpn_addmul_1:
+ stmfd sp!, { r8-r11, lr }
+ mov z, #0
+ mov ip, #0
+ movs n, n, lsr #1
+ bcc skip1
+ ldr lr, [a], #4
+ ldr r9, [p]
+ umlal r9, ip, w, lr
+ str r9, [p], #4
+skip1:
+ movs n, n, lsr #1
+ bcc skip2
+ ldmia p, { r9, r10 }
+ adds r8, ip, r9
+ adc r9, z, #0
+ ldmia a!, { ip, lr }
+ umlal r8, r9, w, ip
+ adds r9, r9, r10
+ adc ip, z, #0
+ umlal r9, ip, w, lr
+ stmia p!, { r8, r9 }
+skip2:
+ teq n, #0
+ beq return
+ stmfd sp!, { r4-r7 }
+addmul_loop:
+ ldmia p, { r5, r6, r7, r8 }
+ adds r4, ip, r5
+ adc r5, z, #0
+ ldmia a!, { r9, r10, ip, lr }
+ umlal r4, r5, w, r9
+ adds r5, r5, r6
+ adc r6, z, #0
+ umlal r5, r6, w, r10
+ adds r6, r6, r7
+ adc r7, z, #0
+ umlal r6, r7, w, ip
+ adds r7, r7, r8
+ adc ip, z, #0
+ umlal r7, ip, w, lr
+ subs n, n, #1
+ stmia p!, { r4, r5, r6, r7 }
+ bne addmul_loop
+ ldmfd sp!, { r4-r7 }
+return:
+ mov r0, ip
+ ldmfd sp!, { r8-r11, pc }
+end:
+ .size __gmpn_addmul_1, end - __gmpn_addmul_1
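
Each umlal is a 32x32->64 multiply-accumulate; chained through register
pairs, the loop computes per limb what this C reference does (a sketch
with 32-bit limbs as on ARM):

    #include <stdint.h>
    #include <stddef.h>

    uint32_t
    addmul_1_ref (uint32_t *rp, const uint32_t *up, size_t n, uint32_t w)
    {
      uint32_t cy = 0;                /* carry limb between iterations */
      for (size_t i = 0; i < n; i++)
        {
          /* up[i]*w + rp[i] + cy fits in 64 bits exactly. */
          uint64_t p = (uint64_t) up[i] * w + rp[i] + cy;
          rp[i] = (uint32_t) p;
          cy = (uint32_t) (p >> 32);
        }
      return cy;
    }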
diff --git a/rts/gmp/mpn/arm/gmp-mparam.h b/rts/gmp/mpn/arm/gmp-mparam.h
new file mode 100644
index 0000000000..a35b0c7b66
--- /dev/null
+++ b/rts/gmp/mpn/arm/gmp-mparam.h
@@ -0,0 +1,34 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 21
+#endif
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 48
+#endif
diff --git a/rts/gmp/mpn/arm/mul_1.S b/rts/gmp/mpn/arm/mul_1.S
new file mode 100644
index 0000000000..bae526a0f0
--- /dev/null
+++ b/rts/gmp/mpn/arm/mul_1.S
@@ -0,0 +1,81 @@
+@ ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+@ in a second limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define p r0
+#define a r1
+#define n r2
+#define w r3
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_mul_1
+ .type __gmpn_mul_1,%function
+__gmpn_mul_1:
+ stmfd sp!, { r8, r9, lr }
+ ands ip, n, #1
+ beq skip1
+ ldr lr, [a], #4
+ umull r9, ip, w, lr
+ str r9, [p], #4
+skip1:
+ tst n, #2
+ beq skip2
+ mov r8, ip
+ ldmia a!, { ip, lr }
+ mov r9, #0
+ umlal r8, r9, w, ip
+ mov ip, #0
+ umlal r9, ip, w, lr
+ stmia p!, { r8, r9 }
+skip2:
+ bics n, n, #3
+ beq return
+ stmfd sp!, { r6, r7 }
+mul_1_loop:
+ mov r6, ip
+ ldmia a!, { r8, r9, ip, lr }
+ ldr r7, [p] /* Bring stuff into cache. */
+ mov r7, #0
+ umlal r6, r7, w, r8
+ mov r8, #0
+ umlal r7, r8, w, r9
+ mov r9, #0
+ umlal r8, r9, w, ip
+ mov ip, #0
+ umlal r9, ip, w, lr
+ subs n, n, #4
+ stmia p!, { r6, r7, r8, r9 }
+ bne mul_1_loop
+ ldmfd sp!, { r6, r7 }
+return:
+ mov r0, ip
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_mul_1, end - __gmpn_mul_1
diff --git a/rts/gmp/mpn/arm/sub_n.S b/rts/gmp/mpn/arm/sub_n.S
new file mode 100644
index 0000000000..856505fe21
--- /dev/null
+++ b/rts/gmp/mpn/arm/sub_n.S
@@ -0,0 +1,79 @@
+@ ARM mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+@ difference in a third limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define d r0
+#define a r1
+#define b r2
+#define n r3
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_sub_n
+ .type __gmpn_sub_n,%function
+__gmpn_sub_n:
+ stmfd sp!, { r8, r9, lr }
+	subs	ip, ip, ip	@ clear ip, set C (no borrow in) for the sbcs chain
+ tst n, #1
+ beq skip1
+ ldr ip, [a], #4
+ ldr lr, [b], #4
+ subs ip, ip, lr
+ str ip, [d], #4
+skip1:
+ tst n, #2
+ beq skip2
+ ldmia a!, { r8, r9 }
+ ldmia b!, { ip, lr }
+ sbcs r8, r8, ip
+ sbcs r9, r9, lr
+ stmia d!, { r8, r9 }
+skip2:
+ bics n, n, #3
+ beq return
+ stmfd sp!, { r4, r5, r6, r7 }
+sub_n_loop:
+ ldmia a!, { r4, r5, r6, r7 }
+ ldmia b!, { r8, r9, ip, lr }
+ sbcs r4, r4, r8
+ ldr r8, [d] /* Bring stuff into cache. */
+ sbcs r5, r5, r9
+ sbcs r6, r6, ip
+ sbcs r7, r7, lr
+ stmia d!, { r4, r5, r6, r7 }
+ sub n, n, #4
+ teq n, #0
+ bne sub_n_loop
+ ldmfd sp!, { r4, r5, r6, r7 }
+return:
+ sbc r0, r0, r0
+ and r0, r0, #1
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_sub_n, end - __gmpn_sub_n
diff --git a/rts/gmp/mpn/asm-defs.m4 b/rts/gmp/mpn/asm-defs.m4
new file mode 100644
index 0000000000..aa2024138b
--- /dev/null
+++ b/rts/gmp/mpn/asm-defs.m4
@@ -0,0 +1,1182 @@
+divert(-1)
+dnl
+dnl m4 macros for gmp assembly code, shared by all CPUs.
+dnl
+dnl These macros are designed for use with any m4 and have been used on
+dnl GNU, FreeBSD, OpenBSD and SysV.
+dnl
+dnl GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error
+dnl messages.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Macros:
+dnl
+dnl Most new m4 specific macros have an "m4_" prefix to emphasise they're
+dnl m4 expansions. But new defining things like deflit() and defreg() are
+dnl named like the builtin define(), and forloop() is named following the
+dnl GNU m4 example on which it's based.
+dnl
+dnl GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that
+dnl option isn't going to be used, so there's no conflict or confusion.
+dnl
+dnl
+dnl Comments in output:
+dnl
+dnl The m4 comment delimiters are left at # and \n, the normal assembler
+dnl commenting for most CPUs. m4 passes comment text through without
+dnl expanding macros in it, which is generally a good thing since it stops
+dnl unexpected expansions and possible resultant errors.
+dnl
+dnl But note that when a quoted string is being read, a # isn't special, so
+dnl apostrophes in comments in quoted strings must be avoided or they'll be
+dnl interpreted as a closing quote mark. But when the quoted text is
+dnl re-read # will still act like a normal comment, suppressing macro
+dnl expansion.
+dnl
+dnl For example,
+dnl
+dnl # apostrophes in comments that're outside quotes are ok
+dnl # and using macro names like PROLOGUE is ok too
+dnl ...
+dnl ifdef(`PIC',`
+dnl # but apostrophes aren't ok inside quotes
+dnl # ^--wrong
+dnl ...
+dnl # though macro names like PROLOGUE are still ok
+dnl ...
+dnl ')
+dnl
+dnl If macro expansion in a comment is wanted, use `#' in the .asm (ie. a
+dnl quoted hash symbol), which will turn into # in the .s but get
+dnl expansions done on that line. This can make the .s more readable to
+dnl humans, but it won't make a blind bit of difference to the assembler.
+dnl
+dnl All the above applies, mutatis mutandis, when changecom() is used to
+dnl select @ ! ; or whatever other commenting.
+dnl
+dnl
+dnl Variations in m4 affecting gmp:
+dnl
+dnl $# - When a macro is called as "foo" with no brackets, BSD m4 sets $#
+dnl to 1, whereas GNU or SysV m4 set it to 0. In all cases though
+dnl "foo()" sets $# to 1. This is worked around in various places.
+dnl
+dnl len() - When "len()" is given an empty argument, BSD m4 evaluates to
+dnl nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0.
+dnl See m4_length() below which works around this.
+dnl
+dnl translit() - GNU m4 accepts character ranges like A-Z, and the new
+dnl OpenBSD m4 does under option -g, but basic BSD and SysV don't.
+dnl
+dnl popdef() - in BSD and SysV m4 popdef() takes multiple arguments and
+dnl pops each, but GNU m4 only takes one argument.
+dnl
+dnl push back - BSD m4 has some limits on the amount of text that can be
+dnl pushed back. The limit is reasonably big and so long as macros
+dnl don't gratuitously duplicate big arguments it isn't a problem.
+dnl Normally an error message is given, but sometimes it just hangs.
+dnl
+dnl eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available,
+dnl but BSD m4 doesn't (contrary to what the man page suggests) and
+dnl instead ^ is exponentiation.
+dnl
+dnl eval() ?: - The C ternary operator "?:" is available in BSD m4, but not
+dnl in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5).
+dnl
+dnl eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31
+dnl (ie. -2147483648) gives "-(". Using -2147483648 within an
+dnl expression is ok, it just can't be a final result. "-(" will of
+dnl course upset parsing, with all sorts of strange effects.
+dnl
+dnl eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on
+dnl SunOS 5.7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't). See
+dnl m4_lshift() and m4_rshift() below for workarounds.
+dnl
+dnl m4wrap() - in BSD m4, m4wrap() replaces any previous m4wrap() string,
+dnl in SysV m4 it appends to it, and in GNU m4 it prepends. See
+dnl m4wrap_prepend() below which brings uniformity to this.
+dnl
+dnl __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and
+dnl they're used here to make error messages more informative. GNU m4
+dnl gives an unhelpful "NONE 0" in an m4wrap(), but that's worked
+dnl around.
+dnl
+dnl __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the
+dnl filename in __file__, so care should be taken that no macro has
+dnl the same name as a file, or an unwanted expansion will occur when
+dnl printing an error or warning.
+dnl
+dnl OpenBSD 2.6 m4 - this m4 rejects decimal constants containing an 8 or 9
+dnl in eval(), making it pretty much unusable. This bug is confined
+dnl to version 2.6 (it's not in 2.5, and has been fixed in 2.7).
+dnl
+dnl SunOS /usr/bin/m4 - this m4 lacks a number of desired features,
+dnl including $# and $@, defn(), m4exit(), m4wrap(), pushdef(),
+dnl popdef(). /usr/5bin/m4 is a SysV style m4 which should always be
+dnl available, and "configure" will reject /usr/bin/m4 in favour of
+dnl /usr/5bin/m4 (if necessary).
+dnl
+dnl The sparc code actually has modest m4 requirements currently and
+dnl could manage with /usr/bin/m4, but there's no reason to put our
+dnl macros through contortions when /usr/5bin/m4 is available or GNU
+dnl m4 can be installed.
+
+
+ifdef(`__ASM_DEFS_M4_INCLUDED__',
+`m4_error(`asm-defs.m4 already included, dont include it twice
+')m4exit(1)')
+define(`__ASM_DEFS_M4_INCLUDED__')
+
+
+dnl Detect and give a message about the unsuitable OpenBSD 2.6 m4.
+
+ifelse(eval(89),89,,
+`errprint(
+`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable.
+This is probably OpenBSD 2.6 m4 (September 1999). Upgrade to OpenBSD 2.7,
+or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4. Dont forget
+to configure with M4=/wherever/m4 if you install one of these in a directory
+not in $PATH.
+')m4exit(1)')
+
+
+dnl Detect and give a message about the unsuitable SunOS /usr/bin/m4.
+dnl
+dnl Unfortunately this test doesn't work when m4 is run in the normal way
+dnl from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4
+dnl takes "-" in "-D..." to mean read stdin, so it will look like it just
+dnl hangs. But running "m4 asm-defs.m4" to try it out will work.
+dnl
+dnl We'd like to abort immediately on finding a problem, but unfortunately
+dnl the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill
+dnl it. Unexpanded $#'s in some m4_assert_numargs() later on will comment
+dnl out some closing parentheses and kill it with "m4: arg stack overflow".
+
+define(m4_dollarhash_works_test,``$#'')
+ifelse(m4_dollarhash_works_test(x),1,,
+`errprint(
+`This m4 doesnt support $# and cant be used for GMP asm processing.
+If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that
+or can get it, otherwise install GNU m4. Dont forget to configure with
+M4=/wherever/m4 if you install in a directory not in $PATH.
+')')
+undefine(`m4_dollarhash_works_test')
+
+
+dnl --------------------------------------------------------------------------
+dnl Basic error handling things.
+
+
+dnl Usage: m4_dollarhash_1_if_noparen_p
+dnl
+dnl Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU
+dnl and SysV m4 give).
+
+define(m4_dollarhash_1_if_noparen_test,`$#')
+define(m4_dollarhash_1_if_noparen_p,
+eval(m4_dollarhash_1_if_noparen_test==1))
+undefine(`m4_dollarhash_1_if_noparen_test')
+
+
+dnl Usage: m4wrap_prepend(string)
+dnl
+dnl Prepend the given string to what will be expanded under m4wrap at the
+dnl end of input.
+dnl
+dnl This macro exists to work around variations in m4wrap() behaviour in
+dnl the various m4s (notes at the start of this file). Don't use m4wrap()
+dnl directly since it will interfere with this scheme.
+
+define(m4wrap_prepend,
+m4_assert_numargs(1)
+`define(`m4wrap_string',`$1'defn(`m4wrap_string'))')
+
+m4wrap(`m4wrap_string')
+define(m4wrap_string,`')
+
+
+dnl Usage: m4_file_and_line
+dnl
+dnl Expand to the current file and line number, if the GNU m4 extensions
+dnl __file__ and __line__ are available.
+dnl
+dnl In GNU m4 1.4 at the end of input when m4wrap text is expanded,
+dnl __file__ is NONE and __line__ is 0, which is not a helpful thing to
+dnl print. If m4_file_seen() has been called to note the last file seen,
+dnl then that file at a big line number is used, otherwise "end of input"
+dnl is used (although "end of input" won't parse as an error message).
+
+define(m4_file_and_line,
+`ifdef(`__file__',
+`ifelse(__file__`'__line__,`NONE0',
+`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')',
+`__file__: __line__: ')')')
+
+
+dnl Usage: m4_errprint_commas(arg,...)
+dnl
+dnl The same as errprint(), but commas are printed between arguments
+dnl instead of spaces.
+
+define(m4_errprint_commas,
+`errprint(`$1')dnl
+ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')')
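+
+dnl For example, m4_errprint_commas(`x',`y',`z') prints "x,y,z", where a
+dnl plain errprint(`x',`y',`z') would print "x y z".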
+
+
+dnl Usage: m4_error(args...)
+dnl m4_warning(args...)
+dnl
+dnl Print an error message, using m4_errprint_commas, prefixed with the
+dnl current filename and line number (if available). m4_error sets up to
+dnl give an error exit at the end of processing, m4_warning just prints.
+dnl These macros are the recommended way to print errors.
+dnl
+dnl The arguments here should be quoted in the usual way to prevent them
+dnl being expanded when the macro call is read. (m4_error takes care not
+dnl to do any further expansion.)
+dnl
+dnl For example,
+dnl
+dnl m4_error(`some error message
+dnl ')
+dnl
+dnl which prints
+dnl
+dnl foo.asm:123: some error message
+dnl
+dnl or if __file__ and __line__ aren't available
+dnl
+dnl some error message
+dnl
+dnl The "file:line:" format is a basic style, used by gcc and GNU m4, so
+dnl emacs and other editors will recognise it in their normal error message
+dnl parsing.
+
+define(m4_warning,
+`m4_errprint_commas(m4_file_and_line`'$@)')
+
+define(m4_error,
+`define(`m4_error_occurred',1)m4_warning($@)')
+
+define(`m4_error_occurred',0)
+
+dnl This m4wrap_prepend() is first, so it'll be executed last.
+m4wrap_prepend(
+`ifelse(m4_error_occurred,1,
+`m4_error(`Errors occurred during m4 processing
+')m4exit(1)')')
+
+
+dnl Usage: m4_assert_numargs(num)
+dnl
+dnl Put this unquoted on a line on its own at the start of a macro
+dnl definition to add some code to check that num many arguments get passed
+dnl to the macro. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_numargs(2)
+dnl `something `$1' and `$2' blah blah')
+dnl
+dnl Then a call like foo(one,two,three) will provoke an error like
+dnl
+dnl file:10: foo expected 2 arguments, got 3 arguments
+dnl
+dnl Here are some calls and how many arguments they're interpreted as passing.
+dnl
+dnl foo(abc,def) 2
+dnl foo(xyz) 1
+dnl foo() 0
+dnl foo -1
+dnl
+dnl The -1 for no parentheses at all means a macro that's meant to be used
+dnl that way can be checked with m4_assert_numargs(-1). For example,
+dnl
+dnl define(SPECIAL_SUFFIX,
+dnl m4_assert_numargs(-1)
+dnl `ifdef(`FOO',`_foo',`_bar')')
+dnl
+dnl But as an alternative see also deflit() below where parenthesized
+dnl expressions following a macro are passed through to the output.
+dnl
+dnl Note that in BSD m4 there's no way to differentiate calls "foo" and
+dnl "foo()", so in BSD m4 the distinction between the two isn't enforced.
+dnl (In GNU and SysV m4 it can be checked, and is.)
+
+
+dnl m4_assert_numargs is able to check its own arguments by calling
+dnl assert_numargs_internal directly.
+dnl
+dnl m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand
+dnl to `$`'0' and do the wrong thing, and likewise for $1. The same is
+dnl done in other assert macros.
+dnl
+dnl $`#' leaves $# in the new macro being defined, and stops # being
+dnl interpreted as a comment character.
+dnl
+dnl `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is
+dnl used. The space means that if there is a dnl it'll still work.
+
+dnl Usage: m4_doublequote(x) expands to ``x''
+define(m4_doublequote,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''')
+
+define(m4_assert_numargs,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl
+`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1'))
+define(m4_assert_numargs_internal,
+`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))')
+
+dnl Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs)
+dnl
+dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl should be -1. If wantargs is -1 but gotargs is 0 and the two can't be
+dnl distinguished then it's allowed to pass.
+dnl
+define(m4_assert_numargs_internal_check,
+`ifelse(eval($2 == $3
+ || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0,
+`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3')
+)')')
+
+dnl Called: m4_numargs_count($#,len(`$1'))
+dnl If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise
+dnl $# args.
+define(m4_numargs_count,
+`ifelse($1,0, -1,
+`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')')
+
+dnl Usage: m4_Narguments(N)
+dnl "$1 argument" or "$1 arguments" with the plural according to $1.
+define(m4_Narguments,
+`$1 argument`'ifelse(`$1',1,,s)')
+
+
+dnl --------------------------------------------------------------------------
+dnl Additional error checking things.
+
+
+dnl Usage: m4_file_seen()
+dnl
+dnl Record __file__ for the benefit of m4_file_and_line in m4wrap text.
+dnl The basic __file__ macro comes out quoted, like `foo.asm', and
+dnl m4_file_seen_last is defined like that too.
+dnl
+dnl This only needs to be used with something that could generate an error
+dnl message in m4wrap text. The x86 PROLOGUE is the only such at the
+dnl moment (at end of input its m4wrap checks for missing EPILOGUE). A few
+dnl include()s can easily trick this scheme, but you'd expect an EPILOGUE
+dnl in the same file as the PROLOGUE.
+
+define(m4_file_seen,
+m4_assert_numargs(0)
+`ifelse(__file__,`NONE',,
+`define(`m4_file_seen_last',m4_doublequote(__file__))')')
+
+
+dnl Usage: m4_assert_onearg()
+dnl
+dnl Put this, unquoted, at the start of a macro definition to add some code
+dnl to check that one argument is passed to the macro, but with that
+dnl argument allowed to be empty. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_onearg()
+dnl `blah blah $1 blah blah')
+dnl
+dnl Calls "foo(xyz)" or "foo()" are accepted. A call "foo(xyz,abc)" fails.
+dnl A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV
+dnl m4 can).
+
+define(m4_assert_onearg,
+m4_assert_numargs(0)
+`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ')
+
+dnl Called: m4_assert_onearg_internal(`macroname',$#)
+define(m4_assert_onearg_internal,
+`ifelse($2,1,,
+`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2')
+)')')
+
+
+dnl Usage: m4_assert_numargs_range(low,high)
+dnl
+dnl Put this, unquoted, at the start of a macro definition to add some code
+dnl to check that between low and high many arguments get passed to the
+dnl macro. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_numargs_range(3,5)
+dnl `mandatory $1 $2 $3 optional $4 $5 end')
+dnl
+dnl See m4_assert_numargs() for more info.
+
+define(m4_assert_numargs_range,
+m4_assert_numargs(2)
+``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1'))
+define(m4_assert_numargs_range_internal,
+m4_assert_numargs(5)
+`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))')
+
+dnl Called: m4_assert_numargs_range_check(`name',low,high,gotargs)
+dnl
+dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl should be -1. To ensure a `high' of -1 works, a fudge is applied to
+dnl gotargs if it's 0 and the 0 and -1 cases can't be distinguished.
+dnl
+define(m4_assert_numargs_range_check,
+m4_assert_numargs(4)
+`ifelse(eval($2 <= $4 &&
+ ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0,
+`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4')
+)')')
+
+
+dnl Usage: m4_assert_defined(symbol)
+dnl
+dnl Put this unquoted on a line of its own at the start of a macro
+dnl definition to add some code to check that the given symbol is defined
+dnl when the macro is used. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_defined(`FOO_PREFIX')
+dnl `FOO_PREFIX whatever')
+dnl
+dnl This is a convenient way to check that the user or ./configure or
+dnl whatever has defined the things needed by a macro, as opposed to
+dnl silently generating garbage.
+
+define(m4_assert_defined,
+m4_assert_numargs(1)
+``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '')
+
+dnl Called: m4_assert_defined_internal(`macroname',`define_required')
+define(m4_assert_defined_internal,
+m4_assert_numargs(2)
+`ifdef(`$2',,
+`m4_error(`$1 needs $2 defined
+')')')
+
+
+dnl Usage: m4_not_for_expansion(`SYMBOL')
+dnl define_not_for_expansion(`SYMBOL')
+dnl
+dnl m4_not_for_expansion turns SYMBOL, if defined, into something which
+dnl will give an error if expanded. For example,
+dnl
+dnl m4_not_for_expansion(`PIC')
+dnl
+dnl define_not_for_expansion is the same, but always makes a definition.
+dnl
+dnl These are for symbols that should be tested with ifdef(`FOO',...)
+dnl rather than be expanded as such. They guard against accidentally
+dnl omitting the quotes, as in ifdef(FOO,...). Note though that they only
+dnl catches this when FOO is defined, so be sure to test code both with and
+dnl without each definition.
+
+define(m4_not_for_expansion,
+m4_assert_numargs(1)
+`ifdef(`$1',`define_not_for_expansion(`$1')')')
+
+define(define_not_for_expansion,
+m4_assert_numargs(1)
+`ifelse(defn(`$1'),,,
+`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion()
+')')dnl
+define(`$1',`m4_not_for_expansion_internal(`$1')')')
+
+define(m4_not_for_expansion_internal,
+`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)'
+')')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various generic m4 things.
+
+
+dnl Usage: m4_ifdef_anyof_p(`symbol',...)
+dnl
+dnl Expand to 1 if any of the symbols in the argument list are defined, or
+dnl to 0 if not.
+
+define(m4_ifdef_anyof_p,
+`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0,
+`ifdef(`$1', 1,
+`m4_ifdef_anyof_p(shift($@))')')')
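+
+dnl For example, m4_ifdef_anyof_p(`OPERATION_add_n',`OPERATION_sub_n')
+dnl expands to 1 when the Makefile has passed -DOPERATION_add_n or
+dnl -DOPERATION_sub_n, and to 0 otherwise.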
+
+
+dnl Usage: m4_length(string)
+dnl
+dnl Determine the length of a string. This is the same as len(), but
+dnl always expands to a number, working around the BSD len() which
+dnl evaluates to nothing given an empty argument.
+
+define(m4_length,
+m4_assert_onearg()
+`eval(len(`$1')-0)')
+
+
+dnl Usage: m4_stringequal_p(x,y)
+dnl
+dnl Expand to 1 or 0 according as strings x and y are equal or not.
+
+define(m4_stringequal_p,
+`ifelse(`$1',`$2',1,0)')
+
+
+dnl Usage: m4_incr_or_decr(n,last)
+dnl
+dnl Do an incr(n) or decr(n), whichever is in the direction of "last".
+dnl Both n and last must be numbers of course.
+
+define(m4_incr_or_decr,
+m4_assert_numargs(2)
+`ifelse(eval($1<$2),1,incr($1),decr($1))')
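+
+dnl For example, m4_incr_or_decr(7,10) expands to 8, and
+dnl m4_incr_or_decr(7,3) expands to 6.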
+
+
+dnl Usage: forloop(i, first, last, statement)
+dnl
+dnl Based on GNU m4 examples/forloop.m4, but extended.
+dnl
+dnl statement is expanded repeatedly, with i successively defined as
+dnl
+dnl first, first+1, ..., last-1, last
+dnl
+dnl Or if first > last, then it's
+dnl
+dnl first, first-1, ..., last+1, last
+dnl
+dnl If first == last, then one expansion is done.
+dnl
+dnl A pushdef/popdef of i is done to preserve any previous definition (or
+dnl lack of definition). first and last are eval()ed and so can be
+dnl expressions.
+dnl
+dnl forloop_first is defined to 1 on the first iteration, 0 on the rest.
+dnl forloop_last is defined to 1 on the last iteration, 0 on the others.
+dnl Nested forloops are allowed, in which case forloop_first and
+dnl forloop_last apply to the innermost loop that's open.
+dnl
+dnl A simple example,
+dnl
+dnl forloop(i, 1, 2*2+1, `dnl
+dnl iteration number i ... ifelse(forloop_first,1,FIRST)
+dnl ')
+
+
+dnl "i" and "statement" are carefully quoted, but "first" and "last" are
+dnl just plain numbers once eval()ed.
+
+define(`forloop',
+m4_assert_numargs(4)
+`pushdef(`$1',eval(`$2'))dnl
+pushdef(`forloop_first',1)dnl
+pushdef(`forloop_last',0)dnl
+forloop_internal(`$1',eval(`$3'),`$4')`'dnl
+popdef(`forloop_first')dnl
+popdef(`forloop_last')dnl
+popdef(`$1')')
+
+dnl Called: forloop_internal(`var',last,statement)
+define(`forloop_internal',
+m4_assert_numargs(3)
+`ifelse($1,$2,
+`define(`forloop_last',1)$3',
+`$3`'dnl
+define(`forloop_first',0)dnl
+define(`$1',m4_incr_or_decr($1,$2))dnl
+forloop_internal(`$1',$2,`$3')')')
+
+
+dnl Usage: m4_toupper(x)
+dnl m4_tolower(x)
+dnl
+dnl Convert the argument string to upper or lower case, respectively.
+dnl Only one argument accepted.
+dnl
+dnl BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet
+dnl is written out.
+
+define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz')
+define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+define(m4_toupper,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)')
+
+define(m4_tolower,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)')
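+
+dnl For example, m4_toupper(`gmp') expands to GMP, and m4_tolower(`GMP')
+dnl expands to gmp.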
+
+
+dnl Usage: m4_empty_if_zero(x)
+dnl
+dnl Evaluate to x, or to nothing if x is 0. x is eval()ed and so can be an
+dnl expression.
+dnl
+dnl This is useful for x86 addressing mode displacements since forms like
+dnl (%ebx) are one byte shorter than 0(%ebx). A macro `foo' for use as
+dnl foo(%ebx) could be defined with the following so it'll be empty if the
+dnl expression comes out zero.
+dnl
+dnl deflit(`foo', `m4_empty_if_zero(a+b*4-c)')
+dnl
+dnl Naturally this shouldn't be done if, say, a computed jump depends on
+dnl the code being a particular size.
+
+define(m4_empty_if_zero,
+m4_assert_onearg()
+`ifelse(eval($1),0,,eval($1))')
+
+
+dnl Usage: m4_log2(x)
+dnl
+dnl Calculate a logarithm to base 2.
+dnl x must be an integral power of 2, between 2**0 and 2**30.
+dnl x is eval()ed, so it can be an expression.
+dnl An error results if x is invalid.
+dnl
+dnl 2**31 isn't supported, because an unsigned 2147483648 is out of range
+dnl of a 32-bit signed int. Also, the bug in BSD m4 where an eval()
+dnl resulting in 2147483648 (or -2147483648 as the case may be) gives `-('
+dnl means tests like eval(1<<31==(x)) would be necessary, but that then
+dnl gives an unattractive explosion of eval() error messages if x isn't
+dnl numeric.
+
+define(m4_log2,
+m4_assert_numargs(1)
+`m4_log2_internal(0,1,eval(`$1'))')
+
+dnl Called: m4_log2_internal(n,2**n,target)
+define(m4_log2_internal,
+m4_assert_numargs(3)
+`ifelse($2,$3,$1,
+`ifelse($1,30,
+`m4_error(`m4_log2() argument too big or not a power of two: $3
+')',
+`m4_log2_internal(incr($1),eval(2*$2),$3)')')')
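+
+dnl For example, m4_log2(16) expands to 4, and since the argument is
+dnl eval()ed first, m4_log2(2*8) does too.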
+
+
+dnl Usage: m4_div2_towards_zero
+dnl
+dnl m4 division is probably whatever a C signed division is, and C doesn't
+dnl specify what rounding gets used on negatives, so this expression forces
+dnl a rounding towards zero.
+
+define(m4_div2_towards_zero,
+m4_assert_numargs(1)
+`eval((($1) + ((($1)<0) & ($1))) / 2)')
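+
+dnl For example, m4_div2_towards_zero(-7) expands to -3, whereas a plain
+dnl eval(-7/2) could give -4 on an m4 whose division rounds towards
+dnl minus infinity.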
+
+
+dnl Usage: m4_lshift(n,count)
+dnl m4_rshift(n,count)
+dnl
+dnl Calculate n shifted left or right by count many bits. Both n and count
+dnl are eval()ed and so can be expressions.
+dnl
+dnl Negative counts are allowed and mean a shift in the opposite direction.
+dnl Negative n is allowed and right shifts will be arithmetic (meaning
+dnl divide by 2**count, rounding towards zero, also meaning the sign bit is
+dnl duplicated).
+dnl
+dnl Use these macros instead of << and >> in eval() since the basic ccs
+dnl SysV m4 doesn't have those operators.
+
+define(m4_rshift,
+m4_assert_numargs(2)
+`m4_lshift(`$1',-(`$2'))')
+
+define(m4_lshift,
+m4_assert_numargs(2)
+`m4_lshift_internal(eval(`$1'),eval(`$2'))')
+
+define(m4_lshift_internal,
+m4_assert_numargs(2)
+`ifelse(eval($2-0==0),1,$1,
+`ifelse(eval($2>0),1,
+`m4_lshift_internal(eval($1*2),decr($2))',
+`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')')
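+
+dnl For example, m4_lshift(3,2) expands to 12, m4_rshift(-12,2) expands
+dnl to -3, and m4_lshift(1,-1) is the same as m4_rshift(1,1), giving 0.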
+
+
+dnl Usage: deflit(name,value)
+dnl
+dnl Like define(), but "name" expands like a literal, rather than taking
+dnl arguments. For example "name(%eax)" expands to "value(%eax)".
+dnl
+dnl Limitations:
+dnl
+dnl $ characters in the value part must have quotes to stop them looking
+dnl like macro parameters. For example, deflit(reg,`123+$`'4+567'). See
+dnl defreg() below for handling simple register definitions like $7 etc.
+dnl
+dnl "name()" is turned into "name", unfortunately. In GNU and SysV m4 an
+dnl error is generated when this happens, but in BSD m4 it will happen
+dnl silently. The problem is that in BSD m4 $# is 1 in both "name" or
+dnl "name()", so there's no way to differentiate them. Because we want
+dnl plain "name" to turn into plain "value", we end up with "name()"
+dnl turning into plain "value" too.
+dnl
+dnl "name(foo)" will lose any whitespace after commas in "foo", for example
+dnl "disp(%eax, %ecx)" would become "128(%eax,%ecx)".
+dnl
+dnl These parentheses oddities shouldn't matter in assembler text, but if
+dnl they do the suggested workaround is to write "name ()" or "name (foo)"
+dnl to stop the parentheses looking like a macro argument list. If a space
+dnl isn't acceptable in the output, then write "name`'()" or "name`'(foo)".
+dnl The `' is stripped when read, but again stops the parentheses looking
+dnl like parameters.
+
+dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs. The
+dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro
+dnl created, not in deflit.
+define(deflit,
+m4_assert_numargs(2)
+`define(`$1',
+`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl
+$2`'dnl
+ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')')
+
+dnl Called: deflit_emptyargcheck(macroname,$#,`$1')
+define(deflit_emptyargcheck,
+`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1,
+`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-defs.m4 for more information)
+')')')
+
+
+dnl Usage: m4_assert(`expr')
+dnl
+dnl Test a compile-time requirement with an m4 expression. The expression
+dnl should be quoted, and will be eval()ed and expected to be non-zero.
+dnl For example,
+dnl
+dnl m4_assert(`FOO*2+6 < 14')
+
+define(m4_assert,
+m4_assert_numargs(1)
+`ifelse(eval($1),1,,
+`m4_error(`assertion failed: $1
+')')')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various assembler things, not specific to any particular CPU.
+dnl
+
+
+dnl Usage: include_mpn(`filename')
+dnl
+dnl Like include(), but adds a path to the mpn source directory. For
+dnl example,
+dnl
+dnl include_mpn(`sparc64/addmul_1h.asm')
+
+define(include_mpn,
+m4_assert_numargs(1)
+m4_assert_defined(`CONFIG_TOP_SRCDIR')
+`include(CONFIG_TOP_SRCDIR`/mpn/$1')')
+
+
+dnl Usage: C comment ...
+dnl
+dnl "C" works like a FORTRAN-style comment character. This can be used for
+dnl comments to the right of assembly instructions, where just dnl would
+dnl remove the linefeed and concatenate adjacent lines.
+dnl
+dnl "C" and/or "dnl" are useful when an assembler doesn't support comments,
+dnl or where different assemblers for a particular CPU have different
+dnl comment styles. The intermediate ".s" files will end up with no
+dnl comments, just code.
+dnl
+dnl Using "C" is not intended to cause offence to anyone who doesn't like
+dnl FORTRAN; but if that happens it's an unexpected bonus.
+
+define(C, `
+dnl')
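+
+dnl For example, in
+dnl
+dnl        addl %eax, %ebx    C combine low limbs
+dnl
+dnl everything from the C to the end of the line is discarded, but the
+dnl linefeed itself is preserved by the newline in the definition above.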
+
+
+dnl Various possible defines passed from the Makefile that are to be tested
+dnl with ifdef() rather than be expanded.
+
+m4_not_for_expansion(`PIC')
+
+dnl aors_n
+m4_not_for_expansion(`OPERATION_add_n')
+m4_not_for_expansion(`OPERATION_sub_n')
+
+dnl aorsmul_n
+m4_not_for_expansion(`OPERATION_addmul_1')
+m4_not_for_expansion(`OPERATION_submul_1')
+
+dnl logops_n
+m4_not_for_expansion(`OPERATION_and_n')
+m4_not_for_expansion(`OPERATION_andn_n')
+m4_not_for_expansion(`OPERATION_nand_n')
+m4_not_for_expansion(`OPERATION_ior_n')
+m4_not_for_expansion(`OPERATION_iorn_n')
+m4_not_for_expansion(`OPERATION_nior_n')
+m4_not_for_expansion(`OPERATION_xor_n')
+m4_not_for_expansion(`OPERATION_xnor_n')
+
+dnl popham
+m4_not_for_expansion(`OPERATION_popcount')
+m4_not_for_expansion(`OPERATION_hamdist')
+
+
+dnl Usage: m4_config_gmp_mparam(`symbol')
+dnl
+dnl Check that `symbol' is defined. If it isn't, issue an error and
+dnl terminate immediately. The error message explains that the symbol
+dnl should be in config.m4, copied from gmp-mparam.h.
+dnl
+dnl Processing is terminated immediately since missing something like
+dnl KARATSUBA_SQR_THRESHOLD can lead to infinite loops with endless error
+dnl messages.
+
+define(m4_config_gmp_mparam,
+m4_assert_numargs(1)
+`ifdef(`$1',,
+`m4_error(`$1 is not defined.
+ "configure" should have extracted this from gmp-mparam.h and put it
+ in config.m4, but somehow this has failed.
+')m4exit(1)')')
+
+
+dnl Usage: defreg(name,reg)
+dnl
+dnl Give a name to a $ style register. For example,
+dnl
+dnl defreg(foo,$12)
+dnl
+dnl defreg() inserts an extra pair of quotes after the $ so that it's not
+dnl interpreted as an m4 macro parameter, ie. foo is actually $`'12. m4
+dnl strips those quotes when foo is expanded.
+dnl
+dnl deflit() is used to make the new definition, so it will expand
+dnl literally even if followed by parentheses ie. foo(99) will become
+dnl $12(99). (But there's nowhere that would be used, is there?)
+dnl
+dnl When making further definitions from existing defreg() macros, remember
+dnl to use defreg() again to protect the $ in the new definitions too. For
+dnl example,
+dnl
+dnl defreg(a0,$4)
+dnl defreg(a1,$5)
+dnl ...
+dnl
+dnl defreg(PARAM_DST,a0)
+dnl
+dnl This is only because a0 is expanded at the time the PARAM_DST
+dnl definition is made, leaving a literal $4 that must be re-quoted. On
+dnl the other hand in something like the following ra is only expanded when
+dnl ret is used and its $`'31 protection will have its desired effect at
+dnl that time.
+dnl
+dnl defreg(ra,$31)
+dnl ...
+dnl define(ret,`j ra')
+dnl
+dnl Note that only $n forms are meant to be used here, and something like
+dnl 128($30) doesn't get protected and will come out wrong.
+
+define(defreg,
+m4_assert_numargs(2)
+`deflit(`$1',
+substr(`$2',0,1)``''substr(`$2',1))')
+
+
+dnl Usage: m4_instruction_wrapper(num)
+dnl
+dnl Put this, unquoted, on a line on its own, at the start of a macro
+dnl that's a wrapper around an assembler instruction. It adds code to give
+dnl a descriptive error message if the macro is invoked without arguments.
+dnl
+dnl For example, suppose jmp needs to be wrapped,
+dnl
+dnl define(jmp,
+dnl m4_instruction_wrapper()
+dnl m4_assert_numargs(1)
+dnl `.byte 0x42
+dnl .long $1
+dnl nop')
+dnl
+dnl The point of m4_instruction_wrapper is to get a better error message
+dnl than m4_assert_numargs would give if jmp is accidentally used as plain
+dnl "jmp foo" instead of the intended "jmp( foo)". "jmp()" with no
+dnl argument also provokes the error message.
+dnl
+dnl m4_instruction_wrapper should only be used with wrapped instructions
+dnl that take arguments, since obviously something meant to be used as
+dnl plain "ret", say, doesn't want to give an error when used that way.
+
+define(m4_instruction_wrapper,
+m4_assert_numargs(0)
+``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl
+m4_doublequote(ifdef(`__file__',__file__,`the m4 sources')),dnl
+$`#',m4_doublequote($`'1))`dnl'')
+
+dnl Called: m4_instruction_wrapper_internal($0,`filename',$#,$1)
+define(m4_instruction_wrapper_internal,
+`ifelse(eval($3<=1 && m4_length(`$4')==0),1,
+`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details
+')')')
+
+
+dnl Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES
+dnl CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES
+dnl
+dnl When code supports a variable amount of loop unrolling, the convention
+dnl is to define UNROLL_COUNT to the number of limbs processed per loop.
+dnl When testing code this can be varied to see how much the loop overhead
+dnl is costing. For example,
+dnl
+dnl deflit(UNROLL_COUNT, 32)
+dnl
+dnl If the forloop() generating the unrolled loop has a pattern processing
+dnl more than one limb, the convention is to express this with CHUNK_COUNT.
+dnl For example,
+dnl
+dnl deflit(CHUNK_COUNT, 2)
+dnl
+dnl The LOG2, MASK and BYTES definitions below are derived from these COUNT
+dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow
+dnl the new definition automatically.
+dnl
+dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as
+dnl a bit mask. BYTES is BYTES_PER_MP_LIMB*COUNT, the number of bytes
+dnl processed in each unrolled loop.
+dnl
+dnl BYTES_PER_MP_LIMB is defined in a CPU specific m4 include file. It
+dnl exists only so the BYTES definitions here can be common to all CPUs.
+dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be
+dnl used because the code is only for a particular CPU; it doesn't need to
+dnl be general.
+dnl
+dnl Note that none of these macros do anything except give conventional
+dnl names to commonly used things. You still have to write your own
+dnl expressions for a forloop() and the resulting address displacements.
+dnl Something like the following would be typical for 4 bytes per limb.
+dnl
+dnl forloop(`i',0,UNROLL_COUNT-1,`
+dnl deflit(`disp',eval(i*4))
+dnl ...
+dnl ')
+dnl
+dnl Or when using CHUNK_COUNT,
+dnl
+dnl forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,`
+dnl deflit(`disp0',eval(i*CHUNK_COUNT*4))
+dnl deflit(`disp1',eval(disp0+4))
+dnl ...
+dnl ')
+dnl
+dnl Clearly `i' can be run starting from 1, or from high to low or whatever
+dnl best suits.
+
+deflit(UNROLL_LOG2,
+m4_assert_defined(`UNROLL_COUNT')
+`m4_log2(UNROLL_COUNT)')
+
+deflit(UNROLL_MASK,
+m4_assert_defined(`UNROLL_COUNT')
+`eval(UNROLL_COUNT-1)')
+
+deflit(UNROLL_BYTES,
+m4_assert_defined(`UNROLL_COUNT')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(UNROLL_COUNT * BYTES_PER_MP_LIMB)')
+
+deflit(CHUNK_LOG2,
+m4_assert_defined(`CHUNK_COUNT')
+`m4_log2(CHUNK_COUNT)')
+
+deflit(CHUNK_MASK,
+m4_assert_defined(`CHUNK_COUNT')
+`eval(CHUNK_COUNT-1)')
+
+deflit(CHUNK_BYTES,
+m4_assert_defined(`CHUNK_COUNT')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(CHUNK_COUNT * BYTES_PER_MP_LIMB)')
+
+
+dnl Usage: MPN(name)
+dnl
+dnl Add MPN_PREFIX to a name.
+dnl MPN_PREFIX defaults to "__gmpn_" if not defined.
+
+ifdef(`MPN_PREFIX',,
+`define(`MPN_PREFIX',`__gmpn_')')
+
+define(MPN,
+m4_assert_numargs(1)
+`MPN_PREFIX`'$1')
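+
+dnl For example, with the default prefix, MPN(`add_n') expands to
+dnl __gmpn_add_n.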
+
+
+dnl Usage: mpn_add_n, etc
+dnl
+dnl Convenience definitions using MPN(), like the #defines in gmp.h. Each
+dnl function that might be implemented in assembler is here.
+
+define(define_mpn,
+m4_assert_numargs(1)
+`define(`mpn_$1',`MPN(`$1')')')
+
+define_mpn(add)
+define_mpn(add_1)
+define_mpn(add_n)
+define_mpn(add_nc)
+define_mpn(addmul_1)
+define_mpn(addmul_1c)
+define_mpn(addsub_n)
+define_mpn(addsub_nc)
+define_mpn(and_n)
+define_mpn(andn_n)
+define_mpn(bdivmod)
+define_mpn(cmp)
+define_mpn(com_n)
+define_mpn(copyd)
+define_mpn(copyi)
+define_mpn(divexact_by3c)
+define_mpn(divrem)
+define_mpn(divrem_1)
+define_mpn(divrem_1c)
+define_mpn(divrem_2)
+define_mpn(divrem_classic)
+define_mpn(divrem_newton)
+define_mpn(dump)
+define_mpn(gcd)
+define_mpn(gcd_1)
+define_mpn(gcdext)
+define_mpn(get_str)
+define_mpn(hamdist)
+define_mpn(invert_limb)
+define_mpn(ior_n)
+define_mpn(iorn_n)
+define_mpn(kara_mul_n)
+define_mpn(kara_sqr_n)
+define_mpn(lshift)
+define_mpn(lshiftc)
+define_mpn(mod_1)
+define_mpn(mod_1c)
+define_mpn(mul)
+define_mpn(mul_1)
+define_mpn(mul_1c)
+define_mpn(mul_basecase)
+define_mpn(mul_n)
+define_mpn(perfect_square_p)
+define_mpn(popcount)
+define_mpn(preinv_mod_1)
+define_mpn(nand_n)
+define_mpn(nior_n)
+define_mpn(random)
+define_mpn(random2)
+define_mpn(rshift)
+define_mpn(rshiftc)
+define_mpn(scan0)
+define_mpn(scan1)
+define_mpn(set_str)
+define_mpn(sqr_basecase)
+define_mpn(sqrtrem)
+define_mpn(sub)
+define_mpn(sub_1)
+define_mpn(sub_n)
+define_mpn(sub_nc)
+define_mpn(submul_1)
+define_mpn(submul_1c)
+define_mpn(toom3_mul_n)
+define_mpn(toom3_sqr_n)
+define_mpn(umul_ppmm)
+define_mpn(udiv_qrnnd)
+define_mpn(xnor_n)
+define_mpn(xor_n)
+
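+dnl Generic ASM_START, PROLOGUE and EPILOGUE.  CPU specific m4 files can
+dnl redefine these (the x86 PROLOGUE described above does).  A typical
+dnl .asm file brackets each function as in
+dnl
+dnl        PROLOGUE(mpn_add_n)
+dnl                ...
+dnl        EPILOGUE(mpn_add_n)
+dnl
+dnl getting the TEXT, ALIGN, GLOBL, TYPE and SIZE boilerplate emitted
+dnl around the function body.
+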
+define(`ASM_START',
+ `')
+
+define(`PROLOGUE',
+ `
+ TEXT
+ ALIGN(4)
+ GLOBL GSYM_PREFIX`$1'
+ TYPE(GSYM_PREFIX`$1',`function')
+GSYM_PREFIX`$1':')
+
+define(`EPILOGUE',
+ `
+ SIZE(GSYM_PREFIX`$1',.-GSYM_PREFIX`$1')')
+
+dnl LSYM_PREFIX might be L$, so defn() must be used to quote it or the L
+dnl will expand as the L macro, an infinite recursion.
+define(`L',`defn(`LSYM_PREFIX')$1')
+
+define(`INT32',
+ `
+ ALIGN(4)
+$1:
+ W32 $2
+ ')
+
+define(`INT64',
+ `
+ ALIGN(8)
+$1:
+ W32 $2
+ W32 $3
+ ')
+
+
+dnl Usage: ALIGN(bytes)
+dnl
+dnl Emit a ".align" directive. The alignment is specified in bytes, and
+dnl will normally need to be a power of 2. The actual ".align" generated
+dnl is either bytes or logarithmic according to what ./configure detects.
+dnl
+dnl ALIGN_FILL_0x90, if defined and equal to "yes", means a ", 0x90" should
+dnl be appended (this is for x86).
+
+define(ALIGN,
+m4_assert_numargs(1)
+m4_assert_defined(`ALIGN_LOGARITHMIC')
+`.align ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl
+ifelse(ALIGN_FILL_0x90,yes,`, 0x90')')
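+
+dnl For example, ALIGN(8) emits ".align 8" as-is, or ".align 3" when
+dnl ALIGN_LOGARITHMIC is "yes".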
+
+
+dnl Usage: MULFUNC_PROLOGUE(function function...)
+dnl
+dnl A dummy macro which is grepped for by ./configure to know what
+dnl functions a multi-function file is providing. Use this if there aren't
+dnl explicit PROLOGUE()s for each possible function.
+dnl
+dnl Multiple MULFUNC_PROLOGUEs can be used, or just one with the function
+dnl names separated by spaces.
+
+define(`MULFUNC_PROLOGUE',
+m4_assert_numargs(1)
+`')
+
+
+divert`'dnl
diff --git a/rts/gmp/mpn/clipper/add_n.s b/rts/gmp/mpn/clipper/add_n.s
new file mode 100644
index 0000000000..538a1caed0
--- /dev/null
+++ b/rts/gmp/mpn/clipper/add_n.s
@@ -0,0 +1,48 @@
+; Clipper __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+ .align 16
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ subq $8,sp
+ storw r6,(sp)
+ loadw 12(sp),r2
+ loadw 16(sp),r3
+ loadq $0,r6 ; clear carry-save register
+
+.Loop: loadw (r1),r4
+ loadw (r2),r5
+ addwc r6,r6 ; restore carry from r6
+ addwc r5,r4
+ storw r4,(r0)
+ subwc r6,r6 ; save carry in r6
+ addq $4,r0
+ addq $4,r1
+ addq $4,r2
+ subq $1,r3
+ brne .Loop
+
+ negw r6,r0
+ loadw (sp),r6
+ addq $8,sp
+ ret sp
diff --git a/rts/gmp/mpn/clipper/mul_1.s b/rts/gmp/mpn/clipper/mul_1.s
new file mode 100644
index 0000000000..c0c756488c
--- /dev/null
+++ b/rts/gmp/mpn/clipper/mul_1.s
@@ -0,0 +1,47 @@
+; Clipper __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+ .align 16
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ subq $8,sp
+ storw r6,(sp)
+ loadw 12(sp),r2
+ loadw 16(sp),r3
+ loadq $0,r6 ; clear carry limb
+
+.Loop: loadw (r1),r4
+ mulwux r3,r4
+ addw r6,r4 ; add old carry limb into low product limb
+ loadq $0,r6
+ addwc r5,r6 ; propagate cy into high product limb
+ storw r4,(r0)
+ addq $4,r0
+ addq $4,r1
+ subq $1,r2
+ brne .Loop
+
+ movw r6,r0
+ loadw 0(sp),r6
+ addq $8,sp
+ ret sp
diff --git a/rts/gmp/mpn/clipper/sub_n.s b/rts/gmp/mpn/clipper/sub_n.s
new file mode 100644
index 0000000000..44d8797289
--- /dev/null
+++ b/rts/gmp/mpn/clipper/sub_n.s
@@ -0,0 +1,48 @@
+; Clipper __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+ .align 16
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ subq $8,sp
+ storw r6,(sp)
+ loadw 12(sp),r2
+ loadw 16(sp),r3
+ loadq $0,r6 ; clear carry-save register
+
+.Loop: loadw (r1),r4
+ loadw (r2),r5
+ addwc r6,r6 ; restore carry from r6
+ subwc r5,r4
+ storw r4,(r0)
+ subwc r6,r6 ; save carry in r6
+ addq $4,r0
+ addq $4,r1
+ addq $4,r2
+ subq $1,r3
+ brne .Loop
+
+ negw r6,r0
+ loadw (sp),r6
+ addq $8,sp
+ ret sp
diff --git a/rts/gmp/mpn/cray/README b/rts/gmp/mpn/cray/README
new file mode 100644
index 0000000000..8195c67e21
--- /dev/null
+++ b/rts/gmp/mpn/cray/README
@@ -0,0 +1,14 @@
+The (poorly optimized) code in this directory was originally written for a
+j90 system, but finished on a c90. It should work on all Cray vector
+computers. For the T3E and T3D systems, the `alpha' subdirectory at the
+same level as the directory containing this file is much better.
+
+* `+' seems to be faster than `|' when combining carries.
+
+* It is possible that the best multiply performance would be achieved by
+ storing only 24 bits per element, and using lazy carry propagation. Before
+ calling i24mult, full carry propagation would be needed.
+
+* Supply tasking versions of the C loops.
+
+
diff --git a/rts/gmp/mpn/cray/add_n.c b/rts/gmp/mpn/cray/add_n.c
new file mode 100644
index 0000000000..1fdb394993
--- /dev/null
+++ b/rts/gmp/mpn/cray/add_n.c
@@ -0,0 +1,96 @@
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
+ For Cray vector processors.
+
+ Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_add_n (c, a, b, n)
+ mp_ptr c;
+ mp_srcptr a, b;
+ mp_size_t n;
+{
+ mp_size_t i;
+ mp_size_t nm1 = n - 1;
+ int more_carries = 0;
+ int carry_out;
+
+ /* For small operands the non-vector code is faster. */
+ if (n < 16)
+ goto sequential;
+
+ if (a == c || b == c)
+ {
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ if (c == a)
+ {
+ /* allocate temp space for a */
+ mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (ax, a, n);
+ a = (mp_srcptr) ax;
+ }
+ if (c == b)
+ {
+ /* allocate temp space for b */
+ mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (bx, b, n);
+ b = (mp_srcptr) bx;
+ }
+ carry_out = mpn_add_n (c, a, b, n);
+ TMP_FREE (marker);
+ return carry_out;
+ }
+
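+  /* First pass, which the compiler can vectorize (note the ivdep
+     below): each c[i] uses only the carry generated at position i-1,
+     not carries rippling up from further down.  Positions where a
+     carry would ripple through (limb sum of all ones with a carry
+     coming in) are counted in more_carries, and in that rare case the
+     whole sum is redone sequentially.  */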
+ carry_out = a[nm1] + b[nm1] < a[nm1];
+
+#pragma _CRI ivdep /* Cray PVP systems */
+ for (i = nm1; i > 0; i--)
+ {
+ int cy_in;
+ cy_in = a[i - 1] + b[i - 1] < a[i - 1];
+ c[i] = a[i] + b[i] + cy_in;
+ more_carries += c[i] < cy_in;
+ }
+ c[0] = a[0] + b[0];
+
+ if (more_carries)
+ {
+ /* This won't vectorize, but we should come here rarely. */
+ int cy;
+ sequential:
+ cy = 0;
+ for (i = 0; i < n; i++)
+ {
+ mp_limb_t ai, ci, t;
+ ai = a[i];
+ t = b[i] + cy;
+ cy = t < cy;
+ ci = ai + t;
+ cy += ci < ai;
+ c[i] = ci;
+ }
+ carry_out = cy;
+ }
+
+ return carry_out;
+}
diff --git a/rts/gmp/mpn/cray/addmul_1.c b/rts/gmp/mpn/cray/addmul_1.c
new file mode 100644
index 0000000000..031b4e8e8d
--- /dev/null
+++ b/rts/gmp/mpn/cray/addmul_1.c
@@ -0,0 +1,46 @@
+/* mpn_addmul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+ mp_ptr p0, p1, tp;
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+ GMPN_MULWW (p1, p0, up, &n, &limb);
+ cy_limb = mpn_add_n (tp, rp, p0, n);
+ rp[0] = tp[0];
+ cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1);
+ cy_limb += p1[n - 1];
+
+ TMP_FREE (marker);
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/cray/gmp-mparam.h b/rts/gmp/mpn/cray/gmp-mparam.h
new file mode 100644
index 0000000000..14f7b8e05b
--- /dev/null
+++ b/rts/gmp/mpn/cray/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 64
+#define BITS_PER_SHORTINT 32
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/cray/mul_1.c b/rts/gmp/mpn/cray/mul_1.c
new file mode 100644
index 0000000000..0c8750b4ac
--- /dev/null
+++ b/rts/gmp/mpn/cray/mul_1.c
@@ -0,0 +1,44 @@
+/* mpn_mul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+ mp_ptr p0, p1;
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+ GMPN_MULWW (p1, p0, up, &n, &limb);
+ rp[0] = p0[0];
+ cy_limb = mpn_add_n (rp + 1, p0 + 1, p1, n - 1);
+ cy_limb += p1[n - 1];
+
+ TMP_FREE (marker);
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/cray/mulww.f b/rts/gmp/mpn/cray/mulww.f
new file mode 100644
index 0000000000..99507c1e44
--- /dev/null
+++ b/rts/gmp/mpn/cray/mulww.f
@@ -0,0 +1,54 @@
+c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+c Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+c This file is part of the GNU MP Library.
+
+c The GNU MP Library is free software; you can redistribute it and/or
+c modify it under the terms of the GNU Lesser General Public License as
+c published by the Free Software Foundation; either version 2.1 of the
+c License, or (at your option) any later version.
+
+c The GNU MP Library is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+c Lesser General Public License for more details.
+
+c You should have received a copy of the GNU Lesser General Public
+c License along with the GNU MP Library; see the file COPYING.LIB. If
+c not, write to the Free Software Foundation, Inc., 59 Temple Place -
+c Suite 330, Boston, MA 02111-1307, USA.
+
+c p1[] = hi(a[]*s); the upper limbs of each product
+c p0[] = low(a[]*s); the corresponding lower limbs
+c n is number of limbs in the vectors
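+c
+c Each 64-bit limb is split into three 22-bit pieces (4194303 being
+c the mask 2**22-1), each shifted left 24, apparently to suit the
+c i24mult primitive.  The cross products t0..t4 are then recombined
+c with shifts, with c propagating the overlap from the low half p0
+c into the high half p1.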
+
+ subroutine gmpn_mulww(p1,p0,a,n,s)
+ integer*8 p1(0:*),p0(0:*),a(0:*),s
+ integer n
+
+ integer*8 a0,a1,a2,s0,s1,s2,c
+ integer*8 ai,t0,t1,t2,t3,t4
+
+ s0 = shiftl(and(s,4194303),24)
+ s1 = shiftl(and(shiftr(s,22),4194303),24)
+ s2 = shiftl(and(shiftr(s,44),4194303),24)
+
+ do i = 0,n-1
+ ai = a(i)
+ a0 = shiftl(and(ai,4194303),24)
+ a1 = shiftl(and(shiftr(ai,22),4194303),24)
+ a2 = shiftl(and(shiftr(ai,44),4194303),24)
+
+ t0 = i24mult(a0,s0)
+ t1 = i24mult(a0,s1)+i24mult(a1,s0)
+ t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0)
+ t3 = i24mult(a1,s2)+i24mult(a2,s1)
+ t4 = i24mult(a2,s2)
+
+ p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0
+ c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+
+ $ shiftl(and(t2,1048575),22),42)
+ p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c
+ end do
+ end
diff --git a/rts/gmp/mpn/cray/mulww.s b/rts/gmp/mpn/cray/mulww.s
new file mode 100644
index 0000000000..890cdcf94d
--- /dev/null
+++ b/rts/gmp/mpn/cray/mulww.s
@@ -0,0 +1,245 @@
+* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+* Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+* This file is generated from mulww.f in this same directory.
+
+* This file is part of the GNU MP Library.
+
+* The GNU MP Library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 2.1 of the
+* License, or (at your option) any later version.
+
+* The GNU MP Library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+
+* You should have received a copy of the GNU Lesser General Public
+* License along with the GNU MP Library; see the file COPYING.LIB. If
+* not, write to the Free Software Foundation, Inc., 59 Temple Place -
+* Suite 330, Boston, MA 02111-1307, USA.
+
+ IDENT GMPN_MULWW
+**********************************************
+* Assemble with Cal Version 2.0 *
+* *
+* Generated by CFT77 6.0.4.19 *
+* on 06/27/00 at 04:34:13 *
+* *
+**********************************************
+* ALLOW UNDERSCORES IN IDENTIFIERS
+ EDIT OFF
+ FORMAT NEW
+@DATA SECTION DATA,CM
+@DATA = W.*
+ CON O'0000000000040000000000
+ CON O'0435152404713723252514 ;GMPN_MUL 1
+ CON O'0535270000000000000000 ;WW 1
+ CON O'0000000000000001200012 ;trbk tbl 1
+ VWD 32/0,32/P.GMPN_MULWW ;trbk tbl 1
+ CON O'0014003000000000001416 ;trbk tbl 1
+ CON O'0000000000000000000011 ;trbk tbl 1
+ CON O'0000000000000000000215 ;trbk tbl 1
+ BSSZ 1 ;trbk tbl 1
+@CODE SECTION CODE
+@CODE = P.*
+L3 = P.* ; 1
+ A0 A6 ;arg base 1
+ A5 6 ;num Darg 1
+ B03,A5 0,A0 ;load DAs 1
+ A0 A1+A2 ; 1
+ A5 1 ;num Ts 1
+ 0,A0 T00,A5 ; 1
+ B02 A2 ;new base 1
+ B66 A3 ;stk top 1
+ B01 A6 ;arg base 1
+ A7 P.L4 ;ofrn rtn 1
+ B00 A7 ;return 1
+ A6 @DATA ; 1
+ J $STKOFEN ;$STKOFEN 1
+GMPN_MULWW = P.* ; 1
+ A0 @DATA+3 ;(trbk) 1
+ B77 A0 ;(trbk) 1
+ A1 13 ;num Bs 1
+ A0 B66 ;stk top 1
+ A2 B66 ;stk tmp 1
+ A4 B67 ;stk limt 1
+ 0,A0 B77,A1 ; 1
+ A7 782 ;stk size 1
+ A3 A2+A7 ; 1
+ A0 A4-A3 ; 1
+ JAM L3 ;overflow 1
+ A0 A6 ;arg base 1
+ A5 6 ;num Darg 1
+ B03,A5 0,A0 ;load DAs 1
+ A0 A1+A2 ; 1
+ A5 1 ;num Ts 1
+ 0,A0 T00,A5 ; 1
+ B02 A2 ;new base 1
+ B66 A3 ;new top 1
+ B01 A6 ;arg base 1
+L4 = P.* ;ofrn rtn 1
+ A7 B07 ;regs 14
+ S7 0,A7 ; 14
+ A6 B10 ;regs 9
+ S6 0,A6 ; 9
+ S5 1 ; 14
+ S4 <22 ; 9
+ S7 S7-S5 ; 14
+ S5 #S7 ; 14
+ T00 S6 ;regs 10
+ S6 S6>22 ; 10
+ S7 T00 ;regs 11
+ S7 S7>44 ; 11
+ S3 T00 ;regs 9
+ S3 S3&S4 ; 9
+ S6 S6&S4 ; 10
+ S7 S7&S4 ; 11
+ S3 S3<24 ; 9
+ S6 S6<24 ; 10
+ S7 S7<24 ; 11
+ S0 S5 ;regs 14
+ S4 S5 ;regs 14
+ S1 S6 ;regs 14
+ S2 S3 ;regs 14
+ S3 S7 ;regs 14
+ JSP L5 ; 14
+L6 = P.* ; 14
+ S7 -S4 ; 14
+ A2 S7 ;regs 14
+ VL A2 ;regs 14
+ A3 B06 ;s_bt_sp 14
+ A5 B05 ;s_bt_sp 14
+ A4 B04 ;s_bt_sp 14
+ A1 VL ; 14
+ A2 S4 ;regs 14
+L7 = P.* ; 14
+ A0 A3 ;regs 15
+ VL A1 ;regs 15
+ V7 ,A0,1 ; 15
+ B11 A5 ;s_bt_sp 15
+ A7 22 ; 17
+ B12 A4 ;s_bt_sp 17
+ V6 V7>A7 ; 17
+ B13 A3 ;s_bt_sp 17
+ S7 <22 ; 17
+ A3 B02 ;s_bt_sp 17
+ V5 S7&V6 ; 17
+ A6 24 ; 17
+ V4 V5<A6 ; 17
+ V3 S1*FV4 ; 22
+ V2 S7&V7 ; 16
+ V1 V2<A6 ; 16
+ V0 S3*FV1 ; 22
+ V6 V0+V3 ; 22
+ A5 44 ; 18
+ V5 V7>A5 ; 18
+ V2 S1*FV1 ; 21
+ V3 S7&V5 ; 18
+ A0 14 ; 34
+ B77 A0 ;regs 34
+ A4 B77 ;regs 34
+ A0 A4+A3 ; 34
+ ,A0,1 V2 ;v_ld_str 34
+ V0 V3<A6 ; 18
+ V7 S2*FV1 ; 20
+ A4 142 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V7 ;v_ld_str 34
+ V5 V7>A7 ; 28
+ V2 S2*FV0 ; 22
+ V3 V6+V2 ; 22
+ S7 <20 ; 28
+ V1 S7&V3 ; 28
+ A4 270 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ A4 14 ; 34
+ A0 A4+A3 ; 34
+ V7 ,A0,1 ;v_ld_str 34
+ V6 V1<A7 ; 28
+ V2 S2*FV4 ; 21
+ V0 V7+V2 ; 21
+ S7 <42 ; 28
+ V1 S7&V0 ; 28
+ A4 398 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ V7 S3*FV4 ; 23
+ V2 V5+V1 ; 28
+ V0 V3<A5 ; 26
+ A5 526 ; 34
+ A0 A5+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ A5 270 ; 34
+ A0 A5+A3 ; 34
+ V4 ,A0,1 ;v_ld_str 34
+ V5 V2+V6 ; 28
+ A5 20 ; 32
+ V1 V3>A5 ; 32
+ V0 S1*FV4 ; 23
+ A5 654 ; 34
+ A0 A5+A3 ; 34
+ ,A0,1 V1 ;v_ld_str 34
+ V6 V7+V0 ; 23
+ A5 2 ; 32
+ V2 V6<A5 ; 32
+ V3 S3*FV4 ; 24
+ A5 142 ; 34
+ A0 A5+A3 ; 34
+ V1 ,A0,1 ;v_ld_str 34
+ A5 526 ; 34
+ A0 A5+A3 ; 34
+ V7 ,A0,1 ;v_ld_str 34
+ V0 V1+V7 ; 26
+ V6 V3<A6 ; 32
+ V4 V6+V2 ; 32
+ A6 42 ; 28
+ V7 V5>A6 ; 28
+ A5 654 ; 34
+ CPW ;cmr_vrsp 34
+ A0 A5+A3 ; 34
+ V1 ,A0,1 ;v_ld_str 34
+ A5 398 ; 34
+ A0 A5+A3 ; 34
+ V3 ,A0,1 ;v_ld_str 34
+ V6 V4+V1 ; 32
+ V2 V3>A6 ; 32
+ V5 V6+V2 ; 32
+ A6 B12 ;s_bt_sp 32
+ V4 V3<A7 ; 26
+ A7 B13 ;regs 34
+ A3 A7+A1 ; 34
+ A7 B11 ;regs 34
+ A5 A7+A1 ; 34
+ A4 A6+A1 ; 34
+ A7 A2+A1 ; 34
+ A0 A2+A1 ; 34
+ A2 128 ; 34
+ B13 A0 ;s_bt_sp 34
+ V1 V0+V4 ; 26
+ A0 B11 ;regs 31
+ ,A0,1 V1 ; 31
+ V6 V5+V7 ; 33
+ A0 A6 ;regs 33
+ ,A0,1 V6 ; 33
+ A0 B13 ;regs 34
+ A1 A2 ;regs 34
+ A2 A7 ;regs 34
+ JAN L7 ; 34
+L8 = P.* ; 34
+L5 = P.* ; 34
+ S1 0 ; 35
+ A0 B02 ; 35
+ A2 B02 ; 35
+ A1 13 ;num Bs 35
+ B66 A0 ; 35
+ B77,A1 0,A0 ; 35
+ A0 A2+A1 ; 35
+ A1 1 ;num Ts 35
+ T00,A1 0,A0 ; 35
+ J B00 ; 35
+ EXT $STKOFEN:p
+ ENTRY GMPN_MULWW
+ END
diff --git a/rts/gmp/mpn/cray/sub_n.c b/rts/gmp/mpn/cray/sub_n.c
new file mode 100644
index 0000000000..902e07a727
--- /dev/null
+++ b/rts/gmp/mpn/cray/sub_n.c
@@ -0,0 +1,97 @@
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+ For Cray vector processors.
+
+ Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_sub_n (c, a, b, n)
+ mp_ptr c;
+ mp_srcptr a, b;
+ mp_size_t n;
+{
+ mp_size_t i;
+ mp_size_t nm1 = n - 1;
+ int more_carries = 0;
+ int carry_out;
+
+ /* For small operands the non-vector code is faster. */
+ if (n < 16)
+ goto sequential;
+
+ if (a == c || b == c)
+ {
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ if (c == a)
+ {
+ /* allocate temp space for a */
+ mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (ax, a, n);
+ a = (mp_srcptr) ax;
+ }
+ if (c == b)
+ {
+ /* allocate temp space for b */
+ mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (bx, b, n);
+ b = (mp_srcptr) bx;
+ }
+ carry_out = mpn_sub_n (c, a, b, n);
+ TMP_FREE (marker);
+ return carry_out;
+ }
+
+ carry_out = a[nm1] < b[nm1];
+
+#pragma _CRI ivdep /* Cray PVP systems */
+ for (i = nm1; i > 0; i--)
+ {
+ int cy_in; mp_limb_t t;
+ cy_in = a[i - 1] < b[i - 1];
+ t = a[i] - b[i];
+ more_carries += t < cy_in;
+ c[i] = t - cy_in;
+ }
+ c[0] = a[0] - b[0];
+
+ if (more_carries)
+ {
+ /* This won't vectorize, but we should come here rarely. */
+ int cy;
+ sequential:
+ cy = 0;
+ for (i = 0; i < n; i++)
+ {
+ mp_limb_t ai, ci, t;
+ ai = a[i];
+ t = b[i] + cy;
+ cy = t < cy;
+ ci = ai - t;
+ cy += ci > ai;
+ c[i] = ci;
+ }
+ carry_out = cy;
+ }
+
+ return carry_out;
+}
diff --git a/rts/gmp/mpn/cray/submul_1.c b/rts/gmp/mpn/cray/submul_1.c
new file mode 100644
index 0000000000..4d2fb13c62
--- /dev/null
+++ b/rts/gmp/mpn/cray/submul_1.c
@@ -0,0 +1,46 @@
+/* mpn_submul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+ mp_ptr p0, p1, tp;
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+ GMPN_MULWW (p1, p0, up, &n, &limb);
+ cy_limb = mpn_sub_n (tp, rp, p0, n);
+ rp[0] = tp[0];
+ cy_limb += mpn_sub_n (rp + 1, tp + 1, p1, n - 1);
+ cy_limb += p1[n - 1];
+
+ TMP_FREE (marker);
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/add_n.c b/rts/gmp/mpn/generic/add_n.c
new file mode 100644
index 0000000000..5fcb7e4835
--- /dev/null
+++ b/rts/gmp/mpn/generic/add_n.c
@@ -0,0 +1,62 @@
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_add_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+  /* The loop counter and index, J, goes from -SIZE to -1.  This way
+     the loop becomes faster.  */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to one addend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x + y; /* add other addend */
+ cy = (y < x) + cy; /* get out carry from that add, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
diff --git a/rts/gmp/mpn/generic/addmul_1.c b/rts/gmp/mpn/generic/addmul_1.c
new file mode 100644
index 0000000000..746ae31307
--- /dev/null
+++ b/rts/gmp/mpn/generic/addmul_1.c
@@ -0,0 +1,65 @@
+/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, add the S1_SIZE least significant limbs of the product to the
+ limb vector pointed to by RES_PTR. Return the most significant limb of
+ the product, adjusted for carry-out from the addition.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x + prod_low;
+ cy_limb += (prod_low < x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
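
umul_ppmm comes from longlong.h and produces the double-limb product, usually
with a single machine instruction. Where no such instruction exists, the
generic fallback splits each limb into halves; a sketch of that idea for
32-bit limbs (illustrative only, not the actual longlong.h code):

    #include <assert.h>
    #include <stdint.h>

    static void
    umul_ppmm_32 (uint32_t *hi, uint32_t *lo, uint32_t u, uint32_t v)
    {
      uint32_t ul = u & 0xFFFF, uh = u >> 16;
      uint32_t vl = v & 0xFFFF, vh = v >> 16;
      uint32_t x0 = ul * vl;            /* low  * low  */
      uint32_t x1 = uh * vl;            /* high * low  */
      uint32_t x2 = ul * vh;            /* low  * high */
      uint32_t x3 = uh * vh;            /* high * high */

      x1 += x0 >> 16;                   /* cannot overflow */
      x1 += x2;                         /* but this can */
      if (x1 < x2)
        x3 += (uint32_t) 1 << 16;       /* carry into the high product */

      *hi = x3 + (x1 >> 16);
      *lo = (x1 << 16) | (x0 & 0xFFFF);
    }

    int
    main (void)
    {
      uint32_t hi, lo;
      umul_ppmm_32 (&hi, &lo, 0xFFFFFFFFu, 0xFFFFFFFFu);
      assert (hi == 0xFFFFFFFEu && lo == 1);    /* (2^32-1)^2 */
      return 0;
    }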
diff --git a/rts/gmp/mpn/generic/addsub_n.c b/rts/gmp/mpn/generic/addsub_n.c
new file mode 100644
index 0000000000..c9bab3ef60
--- /dev/null
+++ b/rts/gmp/mpn/generic/addsub_n.c
@@ -0,0 +1,167 @@
+/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef L1_CACHE_SIZE
+#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
+#endif
+
+#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
+
+
+/* mpn_addsub_n.
+ r1[] = s1[] + s2[]
+ r2[] = s1[] - s2[]
+ All operands have n limbs.
+ In-place operations allowed. */
+mp_limb_t
+#if __STDC__
+mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
+#else
+mpn_addsub_n (r1p, r2p, s1p, s2p, n)
+ mp_ptr r1p, r2p;
+ mp_srcptr s1p, s2p;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t acyn, acyo; /* carry for add */
+ mp_limb_t scyn, scyo; /* carry for subtract */
+ mp_size_t off; /* offset in operands */
+ mp_size_t this_n; /* size of current chunk */
+
+  /* We alternately add and subtract in chunks that fit into the (L1)
+ cache. Since the chunks are several hundred limbs, the function call
+ overhead is insignificant, but we get much better locality. */
+
+  /* We have three variants of the inner loop; the proper loop is chosen
+     depending on whether r1 or r2 is the same operand as s1 or s2.  */
+
+ if (r1p != s1p && r1p != s2p)
+ {
+ /* r1 is not identical to either input operand. We can therefore write
+ to r1 directly, without using temporary storage. */
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+ }
+ }
+ else if (r2p != s1p && r2p != s2p)
+ {
+ /* r2 is not identical to either input operand. We can therefore write
+ to r2 directly, without using temporary storage. */
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+ }
+ }
+ else
+ {
+      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2, or vice versa).
+ Need temporary storage. */
+ mp_limb_t tp[PART_SIZE];
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+ MPN_COPY (r1p + off, tp, this_n);
+ }
+ }
+
+ return 2 * acyo + scyo;
+}
+
+#ifdef MAIN
+#include <stdlib.h>
+#include <stdio.h>
+#include "timing.h"
+
+long cputime ();
+
+int
+main (int argc, char **argv)
+{
+ mp_ptr r1p, r2p, s1p, s2p;
+ double t;
+ mp_size_t n;
+
+ n = strtol (argv[1], 0, 0);
+
+ r1p = malloc (n * BYTES_PER_MP_LIMB);
+ r2p = malloc (n * BYTES_PER_MP_LIMB);
+ s1p = malloc (n * BYTES_PER_MP_LIMB);
+ s2p = malloc (n * BYTES_PER_MP_LIMB);
+ TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
+ printf (" separate add and sub: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
+ printf ("combined addsub separate variables: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
+ printf (" combined addsub r1 overlap: %.3f\n", t);
+  TIME (t,mpn_addsub_n(r1p,r2p,s1p,r2p,n));
+ printf (" combined addsub r2 overlap: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
+ printf (" combined addsub in-place: %.3f\n", t);
+
+ return 0;
+}
+#endif
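
The return value packs both end conditions as 2*acyo + scyo: the add carry in
bit 1 and the subtract borrow in bit 0. A hypothetical caller (sump, diffp,
ap, bp, n are made-up names for n-limb operands) would split it like this:

    mp_limb_t cys = mpn_addsub_n (sump, diffp, ap, bp, n);
    mp_limb_t add_carry  = cys >> 1;    /* carry out of ap[] + bp[]  */
    mp_limb_t sub_borrow = cys & 1;     /* borrow out of ap[] - bp[] */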
diff --git a/rts/gmp/mpn/generic/bdivmod.c b/rts/gmp/mpn/generic/bdivmod.c
new file mode 100644
index 0000000000..c4bcb414e6
--- /dev/null
+++ b/rts/gmp/mpn/generic/bdivmod.c
@@ -0,0 +1,120 @@
+/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d).
+
+ Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and
+ returns the high d%BITS_PER_MP_LIMB bits of Q as the result.
+
+ Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the
+ low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows
+ the limb vectors at qp to overwrite the low limbs at up, provided qp <= up.
+
+ Preconditions:
+ 1. V is odd.
+ 2. usize * BITS_PER_MP_LIMB >= d.
+ 3. If Q and U overlap, qp <= up.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+   301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ References:
+ T. Jebelean, An algorithm for exact division, Journal of Symbolic
+ Computation, v. 15, 1993, pp. 169-180.
+
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+#if __STDC__
+mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
+ mp_srcptr vp, mp_size_t vsize, unsigned long int d)
+#else
+mpn_bdivmod (qp, up, usize, vp, vsize, d)
+ mp_ptr qp;
+ mp_ptr up;
+ mp_size_t usize;
+ mp_srcptr vp;
+ mp_size_t vsize;
+ unsigned long int d;
+#endif
+{
+ mp_limb_t v_inv;
+
+ /* 1/V mod 2^BITS_PER_MP_LIMB. */
+ modlimb_invert (v_inv, vp[0]);
+
+ /* Fast code for two cases previously used by the accel part of mpn_gcd.
+     (Could probably remove this now that it's inlined there.)  */
+ if (usize == 2 && vsize == 2 &&
+ (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB))
+ {
+ mp_limb_t hi, lo;
+ mp_limb_t q = up[0] * v_inv;
+ umul_ppmm (hi, lo, q, vp[0]);
+ up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q;
+ if (d == 2*BITS_PER_MP_LIMB)
+ q = up[1] * v_inv, up[1] = 0, qp[1] = q;
+ return 0;
+ }
+
+ /* Main loop. */
+ while (d >= BITS_PER_MP_LIMB)
+ {
+ mp_limb_t q = up[0] * v_inv;
+ mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ d -= BITS_PER_MP_LIMB;
+ up += 1, usize -= 1;
+ *qp++ = q;
+ }
+
+ if (d)
+ {
+ mp_limb_t b;
+ mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1);
+ if (q <= 1)
+ {
+ if (q == 0)
+ return 0;
+ else
+ b = mpn_sub_n (up, up, vp, MIN (usize, vsize));
+ }
+ else
+ b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
+
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ return q;
+ }
+
+ return 0;
+}
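
The whole function hinges on one identity: with V odd, v_inv = 1/V modulo
2^BITS_PER_MP_LIMB exists, and q = up[0] * v_inv is the unique limb with
q*V == up[0] modulo the limb base, so subtracting q*V clears the low limb
exactly. A standalone sketch with 64-bit limbs, where limb_invert is an
illustrative stand-in for modlimb_invert, built by Newton iteration:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    limb_invert (uint64_t v)            /* v must be odd */
    {
      uint64_t inv = v;                 /* v*v == 1 (mod 8): 3 good bits */
      int i;
      for (i = 0; i < 5; i++)           /* each step doubles the good bits */
        inv *= 2 - v * inv;
      return inv;                       /* now v*inv == 1 (mod 2^64) */
    }

    int
    main (void)
    {
      uint64_t v = 0x10000000000000FFULL;    /* odd divisor */
      uint64_t u0 = 0xDEADBEEFCAFEBABEULL;   /* low limb of U */
      uint64_t q = u0 * limb_invert (v);
      assert (q * v == u0);             /* low limb cancels exactly */
      return 0;
    }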
diff --git a/rts/gmp/mpn/generic/bz_divrem_n.c b/rts/gmp/mpn/generic/bz_divrem_n.c
new file mode 100644
index 0000000000..d234b22af5
--- /dev/null
+++ b/rts/gmp/mpn/generic/bz_divrem_n.c
@@ -0,0 +1,153 @@
+/* mpn_bz_divrem_n and auxiliary routines.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/*
+[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
+   Technical report MPI-I-98-1-022, October 1998.
+ http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz
+*/
+
+static mp_limb_t mpn_bz_div_3_halves_by_2
+ _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n));
+
+
+/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
+ div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
+ i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+#if 0
+static
+unused_mpn_divrem (qp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+{
+ /* This might be useful: */
+ if (qxn != 0)
+ {
+ mp_limb_t c;
+ mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp + qxn - nn, np, nn);
+ MPN_ZERO (tp, qxn);
+ c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn);
+ /* Maybe copy proper part of tp to np? Documentation is unclear about
+ the returned np value when qxn != 0 */
+ return c;
+ }
+}
+#endif
+
+
+/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n)
+ by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n).
+ Returns most significant limb of the quotient, which is 0 or 1.
+ Requires that the most significant bit of the divisor is set. */
+
+mp_limb_t
+#if __STDC__
+mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+#else
+mpn_bz_divrem_n (qp, np, dp, n)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_srcptr dp;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t qhl, cc;
+
+ if (n % 2 != 0)
+ {
+ qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1);
+ cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]);
+ cc = mpn_sub_1 (np + n, np + n, 1, cc);
+ if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]);
+ while (cc)
+ {
+ qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1);
+ cc -= mpn_add_n (np + 1, np + 1, dp, n);
+ }
+ qhl += mpn_add_1 (qp + 1, qp + 1, n - 1,
+ mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
+ }
+ else
+ {
+ mp_size_t n2 = n/2;
+ qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2);
+ qhl += mpn_add_1 (qp + n2, qp + n2, n2,
+ mpn_bz_div_3_halves_by_2 (qp, np, dp, n2));
+ }
+ return qhl;
+}
+
+
+/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n),
+ the remainder in (np, 2n) */
+
+static mp_limb_t
+#if __STDC__
+mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+#else
+mpn_bz_div_3_halves_by_2 (qp, np, dp, n)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_srcptr dp;
+ mp_size_t n;
+#endif
+{
+ mp_size_t twon = n + n;
+ mp_limb_t qhl, cc;
+ mp_ptr tmp;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ if (n < BZ_THRESHOLD)
+ qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
+ else
+ qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n);
+ tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB);
+ mpn_mul_n (tmp, qp, dp, n);
+ cc = mpn_sub_n (np, np, tmp, twon);
+ TMP_FREE (marker);
+ if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n);
+ while (cc)
+ {
+ qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1);
+ cc -= mpn_add_n (np, np, dp, twon);
+ }
+ return qhl;
+}
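
The BZ_THRESHOLD comment above compresses a cost argument. Writing D(n) for
division and M(n) for multiplication, the recursion costs (a standard
divide-and-conquer estimate, not taken from the patch):

    D(n) = 2 D(n/2) + 2 M(n/2)

With schoolbook multiplication, M(n) = c n^2, the level-k terms are
(c n^2 / 2) * 2^(-k) and sum to about c n^2, no better than schoolbook
division. With Karatsuba's M(n) = c n^(log2 3), the terms form a convergent
geometric series, so D(n) = O(M(n)); that is why the default threshold is
expressed as a multiple of KARATSUBA_MUL_THRESHOLD.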
diff --git a/rts/gmp/mpn/generic/cmp.c b/rts/gmp/mpn/generic/cmp.c
new file mode 100644
index 0000000000..8e9792f54e
--- /dev/null
+++ b/rts/gmp/mpn/generic/cmp.c
@@ -0,0 +1,56 @@
+/* mpn_cmp -- Compare two low-level natural-number integers.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE.
+ There are no restrictions on the relative sizes of
+ the two arguments.
+ Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. */
+
+int
+#if __STDC__
+mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size)
+#else
+mpn_cmp (op1_ptr, op2_ptr, size)
+ mp_srcptr op1_ptr;
+ mp_srcptr op2_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t op1_word, op2_word;
+
+ for (i = size - 1; i >= 0; i--)
+ {
+ op1_word = op1_ptr[i];
+ op2_word = op2_ptr[i];
+ if (op1_word != op2_word)
+ goto diff;
+ }
+ return 0;
+ diff:
+  /* This can *not* be simplified to
+     op1_word - op2_word
+     since that expression might give signed overflow.  */
+ return (op1_word > op2_word) ? 1 : -1;
+}
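
The comment's point deserves a concrete case: returned as a signed
difference, the comparison comes out with the wrong sign once the operands
differ in the high bit. A standalone demo with 64-bit limbs (illustrative,
not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      uint64_t a = 0, b = 0x8000000000000001ULL;   /* clearly a < b */
      int64_t naive = (int64_t) (a - b);           /* wraps to 0x7FFF...F */
      assert (naive > 0);                          /* would claim a > b! */
      assert ((a > b) - (a < b) == -1);            /* comparisons get it right */
      return 0;
    }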
diff --git a/rts/gmp/mpn/generic/diveby3.c b/rts/gmp/mpn/generic/diveby3.c
new file mode 100644
index 0000000000..a2fb552bfa
--- /dev/null
+++ b/rts/gmp/mpn/generic/diveby3.c
@@ -0,0 +1,77 @@
+/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
+
+
+/* The "c += ..."s are adding the high limb of 3*l to c. That high limb
+ will be 0, 1 or 2. Doing two separate "+="s seems to turn out better
+ code on gcc (as of 2.95.2 at least).
+
+ When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a
+ limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3
+ gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of
+ only 0 or 1 respectively. Hence the carry out of each stage and for the
+ return value is always only 0, 1 or 2. */
+
+mp_limb_t
+#if __STDC__
+mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c)
+#else
+mpn_divexact_by3c (dst, src, size, c)
+ mp_ptr dst;
+ mp_srcptr src;
+ mp_size_t size;
+ mp_limb_t c;
+#endif
+{
+ mp_size_t i;
+
+ ASSERT (size >= 1);
+
+ i = 0;
+ do
+ {
+ mp_limb_t l, s;
+
+ s = src[i];
+ l = s - c;
+ c = (l > s);
+
+ l *= INVERSE_3;
+ dst[i] = l;
+
+ c += (l > MP_LIMB_T_MAX/3);
+ c += (l > (MP_LIMB_T_MAX/3)*2);
+ }
+ while (++i < size);
+
+ return c;
+}
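
The INVERSE_3 constant turns exact division by 3 into one multiplication per
limb, because 3 is invertible modulo a power of two. A quick standalone check
with 64-bit limbs (illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      const uint64_t inv3 = (UINT64_MAX / 3) * 2 + 1;  /* 0xAAAA...AAAB */
      uint64_t x = 0x123456789ABCDEF0ULL;
      assert (inv3 * 3 == 1);               /* 3 * INVERSE_3 == 1 (mod 2^64) */
      assert ((x * 3) * inv3 == x);         /* multiplying undoes "times 3" */
      return 0;
    }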
diff --git a/rts/gmp/mpn/generic/divrem.c b/rts/gmp/mpn/generic/divrem.c
new file mode 100644
index 0000000000..30673e76d9
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem.c
@@ -0,0 +1,101 @@
+/* mpn_divrem -- Divide natural numbers, producing both remainder and
+ quotient. This is now just a middle layer for calling the new
+ internal mpn_tdiv_qr.
+
+Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+#if __STDC__
+mpn_divrem (mp_ptr qp, mp_size_t qxn,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn)
+#else
+mpn_divrem (qp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+#endif
+{
+ if (dn == 1)
+ {
+ mp_limb_t ret;
+ mp_ptr q2p;
+ mp_size_t qn;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
+
+ np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
+ qn = nn + qxn - 1;
+ MPN_COPY (qp, q2p, qn);
+ ret = q2p[qn];
+
+ TMP_FREE (marker);
+ return ret;
+ }
+ else if (dn == 2)
+ {
+ return mpn_divrem_2 (qp, qxn, np, nn, dp);
+ }
+ else
+ {
+ mp_ptr rp, q2p;
+ mp_limb_t qhl;
+ mp_size_t qn;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ if (qxn != 0)
+ {
+ mp_ptr n2p;
+ n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
+ MPN_ZERO (n2p, qxn);
+ MPN_COPY (n2p + qxn, np, nn);
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
+ MPN_COPY (np, rp, dn);
+ qn = nn - dn + qxn;
+ MPN_COPY (qp, q2p, qn);
+ qhl = q2p[qn];
+ }
+ else
+ {
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
+ MPN_COPY (np, rp, dn); /* overwrite np area with remainder */
+ qn = nn - dn;
+ MPN_COPY (qp, q2p, qn);
+ qhl = q2p[qn];
+ }
+ TMP_FREE (marker);
+ return qhl;
+ }
+}
diff --git a/rts/gmp/mpn/generic/divrem_1.c b/rts/gmp/mpn/generic/divrem_1.c
new file mode 100644
index 0000000000..e93f241c9d
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_1.c
@@ -0,0 +1,248 @@
+/* mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+
+/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb)
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb. */
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+static mp_limb_t
+#if __STDC__
+__gmpn_divmod_1_internal (mp_ptr quot_ptr,
+ mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb)
+#else
+__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
+ mp_ptr quot_ptr;
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n1, n0, r;
+ int dummy;
+
+ /* ??? Should this be handled at all? Rely on callers? */
+ if (dividend_size == 0)
+ return 0;
+
+ /* If multiplication is much faster than division, and the
+ dividend is large, pre-invert the divisor, and use
+ only multiplications in the inner loop. */
+
+ /* This test should be read:
+ Does it ever help to use udiv_qrnnd_preinv?
+ && Does what we save compensate for the inversion overhead? */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6)
+ && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ divisor_limb <<= normalization_steps;
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb, divisor_limb_inverted);
+ n1 = n0;
+ }
+ udiv_qrnnd_preinv (quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb, divisor_limb_inverted);
+ return r >> normalization_steps;
+ }
+ else
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ {
+ quot_ptr[i] = 0;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (quot_ptr[i], r, r,
+ n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+ }
+ }
+ else
+ {
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ divisor_limb <<= normalization_steps;
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb);
+ n1 = n0;
+ }
+ udiv_qrnnd (quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb);
+ return r >> normalization_steps;
+ }
+ }
+ /* No normalization needed, either because udiv_qrnnd doesn't require
+ it, or because DIVISOR_LIMB is already normalized. */
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ {
+ quot_ptr[i] = 0;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb);
+ }
+ return r;
+ }
+}
+
+
+
+mp_limb_t
+#if __STDC__
+mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
+ mp_srcptr np, mp_size_t nn,
+ mp_limb_t d)
+#else
+mpn_divrem_1 (qp, qxn, np, nn, d)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_srcptr np;
+ mp_size_t nn;
+ mp_limb_t d;
+#endif
+{
+ mp_limb_t rlimb;
+ mp_size_t i;
+
+ /* Develop integer part of quotient. */
+ rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d);
+
+  /* Develop fraction part of quotient.  This is not as fast as it should be;
+ the preinvert stuff from __gmpn_divmod_1_internal ought to be used here
+ too. */
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, d);
+ if (normalization_steps != 0)
+ {
+ d <<= normalization_steps;
+ rlimb <<= normalization_steps;
+
+ for (i = qxn - 1; i >= 0; i--)
+ udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
+
+ return rlimb >> normalization_steps;
+ }
+ else
+ /* fall out */
+ ;
+ }
+
+ for (i = qxn - 1; i >= 0; i--)
+ udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
+
+ return rlimb;
+}
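
The preinversion described above replaces the per-limb division with two
multiplications against a one-time reciprocal. A standalone sketch of the
idea for 32-bit limbs and a normalized divisor (illustrative only, not the
gmp-impl.h invert_limb/udiv_qrnnd_preinv macros):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      uint32_t d = 0x80000001u;             /* normalized: high bit set */
      /* inv = floor((2^64 - 1) / d) - 2^32, the precomputed reciprocal. */
      uint32_t inv = (uint32_t) (((((uint64_t) (uint32_t) ~d) << 32)
                                  | 0xFFFFFFFFu) / d);

      uint64_t n = 0x0123456789ABCDEFULL;   /* numerator; n/d fits one limb */
      uint32_t nh = (uint32_t) (n >> 32);

      /* Quotient estimate from the high limb alone: never too large. */
      uint32_t q = (uint32_t) (((uint64_t) nh * inv) >> 32) + nh;
      uint64_t r = n - (uint64_t) q * d;
      while (r >= d)                        /* small, bounded fix-up */
        q++, r -= d;

      assert (q == n / d && r == n % d);
      return 0;
    }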
diff --git a/rts/gmp/mpn/generic/divrem_2.c b/rts/gmp/mpn/generic/divrem_2.c
new file mode 100644
index 0000000000..0bc31ae2e7
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_2.c
@@ -0,0 +1,151 @@
+/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and
+ quotient. The divisor is two limbs.
+
+ THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/2) and write
+ the NSIZE-2 least significant quotient limbs at QP
+   and the 2-limb remainder at NP.  If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+   Return the most significant limb of the quotient; this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= 2.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + 2 >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+      remainder in NUM.)
+ 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */
+
+mp_limb_t
+#if __STDC__
+mpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp)
+#else
+mpn_divrem_2 (qp, qxn, np, nsize, dp)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t n1, n0, n2;
+ mp_limb_t d1, d0;
+ mp_limb_t d1inv;
+ int have_preinv;
+
+ np += nsize - 2;
+ d1 = dp[1];
+ d0 = dp[0];
+ n1 = np[1];
+ n0 = np[0];
+
+ if (n1 >= d1 && (n1 > d1 || n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ most_significant_q_limb = 1;
+ }
+
+ /* If multiplication is much faster than division, preinvert the most
+ significant divisor limb before entering the loop. */
+ if (UDIV_TIME > 2 * UMUL_TIME + 6)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME)
+ {
+ invert_limb (d1inv, d1);
+ have_preinv = 1;
+ }
+ }
+
+ for (i = qxn + nsize - 2 - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t r;
+
+ if (i >= qxn)
+ np--;
+ else
+ np[0] = 0;
+
+ if (n1 == d1)
+ {
+ /* Q should be either 111..111 or 111..110. Need special treatment
+ of this rare case as normal division would give overflow. */
+ q = ~(mp_limb_t) 0;
+
+ r = n0 + d1;
+ if (r < d1) /* Carry in the addition? */
+ {
+ add_ssaaaa (n1, n0, r - d0, np[0], 0, d0);
+ qp[i] = q;
+ continue;
+ }
+ n1 = d0 - (d0 != 0);
+ n0 = -d0;
+ }
+ else
+ {
+ if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv)
+ udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv);
+ else
+ udiv_qrnnd (q, r, n1, n0, d1);
+ umul_ppmm (n1, n0, d0, q);
+ }
+
+ n2 = np[0];
+
+ q_test:
+ if (n1 > r || (n1 == r && n0 > n2))
+ {
+ /* The estimated Q was too large. */
+ q--;
+
+ sub_ddmmss (n1, n0, n1, n0, 0, d0);
+ r += d1;
+ if (r >= d1) /* If not carry, test Q again. */
+ goto q_test;
+ }
+
+ qp[i] = q;
+ sub_ddmmss (n1, n0, r, n2, n1, n0);
+ }
+ np[1] = n1;
+ np[0] = n0;
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/dump.c b/rts/gmp/mpn/generic/dump.c
new file mode 100644
index 0000000000..66f375c74b
--- /dev/null
+++ b/rts/gmp/mpn/generic/dump.c
@@ -0,0 +1,76 @@
+/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO
+ CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS
+ FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpn_dump (mp_srcptr ptr, mp_size_t size)
+#else
+mpn_dump (ptr, size)
+ mp_srcptr ptr;
+ mp_size_t size;
+#endif
+{
+ MPN_NORMALIZE (ptr, size);
+
+ if (size == 0)
+ printf ("0\n");
+ else
+ {
+ size--;
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0)
+ {
+ printf ("%lX",
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", ptr[size]);
+
+ while (size)
+ {
+ size--;
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]);
+ }
+ printf ("\n");
+ }
+}
diff --git a/rts/gmp/mpn/generic/gcd.c b/rts/gmp/mpn/generic/gcd.c
new file mode 100644
index 0000000000..059e219a06
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd.c
@@ -0,0 +1,414 @@
+/* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Integer greatest common divisor of two unsigned integers, using
+ the accelerated algorithm (see reference below).
+
+ mp_size_t mpn_gcd (up, usize, vp, vsize).
+
+ Preconditions [U = (up, usize) and V = (vp, vsize)]:
+
+ 1. V is odd.
+ 2. numbits(U) >= numbits(V).
+
+ Both U and V are destroyed by the operation. The result is left at vp,
+ and its size is returned.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+   301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ Refer to
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
+ algorithm is used, otherwise the binary algorithm is used. This may be
+ adjusted for different architectures. */
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+
+/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
+ algorithm reduces using the bmod operation. Otherwise, the k-ary reduction
+ is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */
+enum
+ {
+ BMOD_THRESHOLD = BITS_PER_MP_LIMB/2
+ };
+
+
+/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
+ Both U and V must be odd. */
+static __gmp_inline mp_size_t
+#if __STDC__
+gcd_2 (mp_ptr vp, mp_srcptr up)
+#else
+gcd_2 (vp, up)
+ mp_ptr vp;
+ mp_srcptr up;
+#endif
+{
+ mp_limb_t u0, u1, v0, v1;
+ mp_size_t vsize;
+
+ u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1];
+
+ while (u1 != v1 && u0 != v0)
+ {
+ unsigned long int r;
+ if (u1 > v1)
+ {
+ u1 -= v1 + (u0 < v0), u0 -= v0;
+ count_trailing_zeros (r, u0);
+ u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r;
+ u1 >>= r;
+ }
+ else /* u1 < v1. */
+ {
+ v1 -= u1 + (v0 < u0), v0 -= u0;
+ count_trailing_zeros (r, v0);
+ v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r;
+ v1 >>= r;
+ }
+ }
+
+ vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
+
+ /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
+ if (u1 == v1 && u0 == v0)
+ return vsize;
+
+ v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
+ vp[0] = mpn_gcd_1 (vp, vsize, v0);
+
+ return 1;
+}
+
+/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists
+ 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB).
+ In the reference article, D was computed along with N, but it is better to
+ compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating
+ the result as a twos' complement signed integer.
+
+ Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference
+ article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use
+ 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double
+ precision. If N2 > N1 initially, the first iteration of the while loop
+ will swap them. In all other situations, N1 >= N2 is maintained. */
+
+static
+#if ! defined (__i386__)
+__gmp_inline /* don't inline this for the x86 */
+#endif
+mp_limb_t
+#if __STDC__
+find_a (mp_srcptr cp)
+#else
+find_a (cp)
+ mp_srcptr cp;
+#endif
+{
+ unsigned long int leading_zero_bits = 0;
+
+ mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. */
+ mp_limb_t n1_h = cp[1];
+
+ mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */
+ mp_limb_t n2_h = ~n1_h;
+
+ /* Main loop. */
+ while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */
+ {
+ /* N1 <-- N1 % N2. */
+ if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0)
+ {
+ unsigned long int i;
+ count_leading_zeros (i, n2_h);
+ i -= leading_zero_bits, leading_zero_bits += i;
+ n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i;
+ do
+ {
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+ n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1;
+ i -= 1;
+ }
+ while (i);
+ }
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+
+ MP_LIMB_T_SWAP (n1_h, n2_h);
+ MP_LIMB_T_SWAP (n1_l, n2_l);
+ }
+
+ return n2_l;
+}
+
+mp_size_t
+#if __STDC__
+mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, usize, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t usize;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+{
+ mp_ptr orig_vp = vp;
+ mp_size_t orig_vsize = vsize;
+ int binary_gcd_ctr; /* Number of times binary gcd will execute. */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
+     Two EXTRA limbs for U and V are required for k-ary reduction.  */
+ if (vsize >= GCD_ACCEL_THRESHOLD)
+ {
+ unsigned long int vbitsize, d;
+ mp_ptr orig_up = up;
+ mp_size_t orig_usize = usize;
+ mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
+
+ MPN_COPY (anchor_up, orig_up, usize);
+ up = anchor_up;
+
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ /* Use bmod reduction to quickly discover whether V divides U. */
+ up[usize++] = 0; /* Insert leading zero. */
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+
+ /* Now skip U/V mod 2^d and any low zero limbs. */
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+
+ if (usize == 0) /* GCD == ORIG_V. */
+ goto done;
+
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
+ MPN_COPY (vp, orig_vp, vsize);
+
+ do /* Main loop. */
+ {
+ /* mpn_com_n can't be used here because anchor_up and up may
+ partially overlap */
+ if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */
+ {
+ mp_size_t i;
+ anchor_up[0] = -up[0];
+ for (i = 1; i < usize; i++)
+ anchor_up[i] = ~up[i];
+ up = anchor_up;
+ }
+
+ MPN_NORMALIZE_NOT_ZERO (up, usize);
+
+ if ((up[0] & 1) == 0) /* Result even; remove twos. */
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (anchor_up, up, usize, r);
+ usize -= (anchor_up[usize-1] == 0);
+ }
+ else if (anchor_up != up)
+ MPN_COPY_INCR (anchor_up, up, usize);
+
+ MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
+ up = anchor_up;
+
+ if (vsize <= 2) /* Kary can't handle < 2 limbs and */
+ break; /* isn't efficient for == 2 limbs. */
+
+ d = vbitsize;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ if (d > BMOD_THRESHOLD) /* Bmod reduction. */
+ {
+ up[usize++] = 0;
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ else /* Kary reduction. */
+ {
+ mp_limb_t bp[2], cp[2];
+
+ /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */
+ {
+ mp_limb_t u_inv, hi, lo;
+ modlimb_invert (u_inv, up[0]);
+ cp[0] = vp[0] * u_inv;
+ umul_ppmm (hi, lo, cp[0], up[0]);
+ cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv;
+ }
+
+ /* U <-- find_a (C) * U. */
+ up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
+ usize++;
+
+ /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1).
+ bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and
+ bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2
+
+ Like V/U above, but simplified because only the low bit of
+ bp[1] is wanted. */
+ {
+ mp_limb_t v_inv, hi, lo;
+ modlimb_invert (v_inv, vp[0]);
+ bp[0] = up[0] * v_inv;
+ umul_ppmm (hi, lo, bp[0], vp[0]);
+ bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1;
+ }
+
+ up[usize++] = 0;
+ if (bp[1]) /* B < 0: U <-- U + (-B) * V. */
+ {
+ mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]);
+ mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
+ }
+ else /* B >= 0: U <-- U - B * V. */
+ {
+ mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ }
+
+ up += 2, usize -= 2; /* At least two low limbs are zero. */
+ }
+
+ /* Must remove low zero limbs before complementing. */
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+ }
+ while (usize);
+
+ /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */
+ up = orig_up, usize = orig_usize;
+ binary_gcd_ctr = 2;
+ }
+ else
+ binary_gcd_ctr = 1;
+
+ /* Finish up with the binary algorithm. Executes once or twice. */
+ for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
+ {
+ if (usize > 2) /* First make U close to V in size. */
+ {
+ unsigned long int vbitsize, d;
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize - 1;
+ if (d != -(unsigned long int)1 && d > 2)
+ {
+ mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */
+ d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ }
+
+ /* Start binary GCD. */
+ do
+ {
+ mp_size_t zeros;
+
+ /* Make sure U is odd. */
+ MPN_NORMALIZE (up, usize);
+ while (up[0] == 0)
+ up += 1, usize -= 1;
+ if ((up[0] & 1) == 0)
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (up, up, usize, r);
+ usize -= (up[usize-1] == 0);
+ }
+
+ /* Keep usize >= vsize. */
+ if (usize < vsize)
+ MPN_PTR_SWAP (up, usize, vp, vsize);
+
+ if (usize <= 2) /* Double precision. */
+ {
+ if (vsize == 1)
+ vp[0] = mpn_gcd_1 (up, usize, vp[0]);
+ else
+ vsize = gcd_2 (vp, up);
+ break; /* Binary GCD done. */
+ }
+
+ /* Count number of low zero limbs of U - V. */
+ for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
+ continue;
+
+ /* If U < V, swap U and V; in any case, subtract V from U. */
+ if (zeros == vsize) /* Subtract done. */
+ up += zeros, usize -= zeros;
+ else if (usize == vsize)
+ {
+ mp_size_t size = vsize;
+ do
+ size--;
+ while (up[size] == vp[size]);
+ if (up[size] < vp[size]) /* usize == vsize. */
+ MP_PTR_SWAP (up, vp);
+ up += zeros, usize = size + 1 - zeros;
+ mpn_sub_n (up, up, vp + zeros, usize);
+ }
+ else
+ {
+ mp_size_t size = vsize - zeros;
+ up += zeros, usize -= zeros;
+ if (mpn_sub_n (up, up, vp + zeros, size))
+ {
+ while (up[size] == 0) /* Propagate borrow. */
+ up[size++] = -(mp_limb_t)1;
+ up[size] -= 1;
+ }
+ }
+ }
+ while (usize); /* End binary GCD. */
+ }
+
+done:
+ if (vp != gp)
+ MPN_COPY (gp, vp, vsize);
+ TMP_FREE (marker);
+ return vsize;
+}
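
The "binary algorithm" this file falls back to is easiest to see at
single-limb size: strip the shared powers of two, make both operands odd,
then repeatedly subtract the smaller from the larger and strip twos again.
A standalone sketch with uint64_t standing in for mp_limb_t (illustrative,
not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    binary_gcd (uint64_t u, uint64_t v)     /* requires u, v > 0 */
    {
      unsigned twos = 0;
      while (((u | v) & 1) == 0)            /* factor out shared twos */
        u >>= 1, v >>= 1, twos++;
      while ((u & 1) == 0) u >>= 1;         /* make both odd */
      while ((v & 1) == 0) v >>= 1;
      while (u != v)                        /* difference of odds is even */
        {
          if (u > v) { u -= v; do u >>= 1; while ((u & 1) == 0); }
          else       { v -= u; do v >>= 1; while ((v & 1) == 0); }
        }
      return u << twos;
    }

    int
    main (void)
    {
      assert (binary_gcd (48, 18) == 6);
      assert (binary_gcd (461952, 116298) == 18);
      return 0;
    }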
diff --git a/rts/gmp/mpn/generic/gcd_1.c b/rts/gmp/mpn/generic/gcd_1.c
new file mode 100644
index 0000000000..1832636636
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd_1.c
@@ -0,0 +1,77 @@
+/* mpn_gcd_1 --
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Does not work for U == 0 or V == 0. It would be tough to make it work for
+ V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. */
+
+mp_limb_t
+#if __STDC__
+mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
+#else
+mpn_gcd_1 (up, size, vlimb)
+ mp_srcptr up;
+ mp_size_t size;
+ mp_limb_t vlimb;
+#endif
+{
+ mp_limb_t ulimb;
+ unsigned long int u_low_zero_bits, v_low_zero_bits;
+
+ if (size > 1)
+ {
+ ulimb = mpn_mod_1 (up, size, vlimb);
+ if (ulimb == 0)
+ return vlimb;
+ }
+ else
+ ulimb = up[0];
+
+ /* Need to eliminate low zero bits. */
+ count_trailing_zeros (u_low_zero_bits, ulimb);
+ ulimb >>= u_low_zero_bits;
+
+ count_trailing_zeros (v_low_zero_bits, vlimb);
+ vlimb >>= v_low_zero_bits;
+
+ while (ulimb != vlimb)
+ {
+ if (ulimb > vlimb)
+ {
+ ulimb -= vlimb;
+ do
+ ulimb >>= 1;
+ while ((ulimb & 1) == 0);
+ }
+ else /* vlimb > ulimb. */
+ {
+ vlimb -= ulimb;
+ do
+ vlimb >>= 1;
+ while ((vlimb & 1) == 0);
+ }
+ }
+
+ return ulimb << MIN (u_low_zero_bits, v_low_zero_bits);
+}
diff --git a/rts/gmp/mpn/generic/gcdext.c b/rts/gmp/mpn/generic/gcdext.c
new file mode 100644
index 0000000000..fe22d779a6
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcdext.c
@@ -0,0 +1,700 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 17
+#endif
+
+#ifndef EXTEND
+#define EXTEND 1
+#endif
+
+#if STAT
+int arr[BITS_PER_MP_LIMB];
+#endif
+
+
+/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE)
+
+ Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the
+ greatest common divisor at GP (unless it is 0), and the first cofactor at
+ SP. Write the size of the cofactor through the pointer SSIZE. Return the
+ size of the value at GP. Note that SP might be a negative number; this is
+ denoted by storing the negative of the size through SSIZE.
+
+ {UP,USIZE} and {VP,VSIZE} are both clobbered.
+
+ The space allocation for all four areas needs to be USIZE+1.
+
+ Preconditions: 1) U >= V.
+ 2) V > 0. */
+
+/* We use Lehmer's algorithm. The idea is to extract the most significant
+ bits of the operands, and compute the continued fraction for them. We then
+ apply the gathered cofactors to the full operands.
+
+ Idea 1: After we have performed a full division, don't shift operands back,
+ but instead account for the extra factors-of-2 thus introduced.
+ Idea 2: Simple generalization to use divide-and-conquer would give us an
+ algorithm that runs faster than O(n^2).
+ Idea 3: The input numbers need less space as the computation progresses,
+ while the s0 and s1 variables need more space. To save memory, we
+ could make them share space, and have the latter variables grow
+ into the former.
+ Idea 4: We should not do double-limb arithmetic from the start. Instead,
+ do things in single-limb arithmetic until the quotients differ,
+ and then switch to double-limb arithmetic. */
+
+
+/* Division optimized for small quotients. If the quotient is more than one limb,
+ store 1 in *qh and return 0. */
+static mp_limb_t
+#if __STDC__
+div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)
+#else
+div2 (qh, n1, n0, d1, d0)
+ mp_limb_t *qh;
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d1;
+ mp_limb_t d0;
+#endif
+{
+ if (d1 == 0)
+ {
+ *qh = 1;
+ return 0;
+ }
+
+ if ((mp_limb_signed_t) n1 < 0)
+ {
+ mp_limb_t q;
+ int cnt;
+ for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+ else
+ {
+ mp_limb_t q;
+ int cnt;
+ for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+}
+
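/* Aside (not GMP code): div2 feeds the quotient-agreement test in the main
   loop below.  The underlying fact, Lehmer's observation, is that quotients
   computed from the leading bits with opposite corrections bracket the true
   quotient, so when they agree the full-precision quotient is known.  A
   standalone sketch with the cofactor corrections reduced to +-1:

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      uint64_t U = 0x0123456789ABCDEFULL, V = 0x00FEDCBA98765432ULL;
      int shift = 24;                      // keep only the leading bits
      uint64_t uh = U >> shift, vh = V >> shift;

      uint64_t qlo = uh / (vh + 1);        // pessimistic estimate
      uint64_t qhi = (uh + 1) / vh;        // optimistic estimate
      uint64_t q = U / V;                  // the quotient Lehmer wants

      assert (qlo <= q && q <= qhi);       // always bracketed
      if (qlo == qhi)                      // estimates agree ...
        assert (q == qlo);                 // ... so q is known exactly
      return 0;
    }
*/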
+mp_size_t
+#if EXTEND
+#if __STDC__
+mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr s0p;
+ mp_size_t *s0size;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#else
+#if __STDC__
+mpn_gcd (mp_ptr gp,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#endif
+{
+ mp_limb_t A, B, C, D;
+ int cnt;
+ mp_ptr tp, wp;
+#if RECORD
+ mp_limb_t max = 0;
+#endif
+#if EXTEND
+ mp_ptr s1p;
+ mp_ptr orig_s0p = s0p;
+ mp_size_t ssize;
+ int sign = 1;
+#endif
+ int use_double_flag;
+ TMP_DECL (mark);
+
+ TMP_MARK (mark);
+
+ use_double_flag = (size >= GCDEXT_THRESHOLD);
+
+ tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+ wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+#if EXTEND
+ s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+
+ MPN_ZERO (s0p, size);
+ MPN_ZERO (s1p, size);
+
+ s0p[0] = 1;
+ s1p[0] = 0;
+ ssize = 1;
+#endif
+
+ if (size > vsize)
+ {
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ /* This is really what it boils down to in this case... */
+ s0p[0] = 0;
+ s1p[0] = 1;
+ sign = -sign;
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+
+ for (;;)
+ {
+ mp_limb_t asign;
+ /* Figure out exact size of V. */
+ vsize = size;
+ MPN_NORMALIZE (vp, vsize);
+ if (vsize <= 1)
+ break;
+
+ if (use_double_flag)
+ {
+ mp_limb_t uh, vh, ul, vl;
+ /* Let UH,UL be the most significant limbs of U, and let VH,VL be
+ the corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ ul = up[size - 2];
+ vl = vp[size - 2];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt));
+ vl <<= cnt;
+ ul <<= cnt;
+ if (size >= 3)
+ {
+ ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ }
+ }
+
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ for (;;)
+ {
+ mp_limb_t T;
+ mp_limb_t qh, q1, q2;
+ mp_limb_t nh, nl, dh, dl;
+ mp_limb_t t1, t0;
+ mp_limb_t Th, Tl;
+
+ sub_ddmmss (dh, dl, vh, vl, 0, C);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ add_ssaaaa (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ sub_ddmmss (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+
+ add_ssaaaa (dh, dl, vh, vl, 0, C);
+ sub_ddmmss (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ sub_ddmmss (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+ else /* Same, but using single-limb calculations. */
+ {
+ mp_limb_t uh, vh;
+ /* Make UH be the most significant limb of U, and make VH be
+ corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ }
+
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ for (;;)
+ {
+ mp_limb_t q, T;
+ if (vh - C == 0 || vh + D == 0)
+ break;
+
+ q = (uh + A) / (vh - C);
+ if (q != (uh - B) / (vh + D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+
+ if (vh - D == 0)
+ break;
+
+ q = (uh - A) / (vh + C);
+ if (q != (uh + B) / (vh - D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+
+#if RECORD
+ max = MAX (A, max); max = MAX (B, max);
+ max = MAX (C, max); max = MAX (D, max);
+#endif
+
+ if (B == 0)
+ {
+ mp_limb_t qh;
+ mp_size_t i;
+ /* This is quite rare. I.e., optimize something else! */
+
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ qh = mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ MPN_COPY (tp, s0p, ssize);
+ {
+ mp_size_t qsize;
+
+ qsize = size - vsize; /* size of stored quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ abort ();
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]);
+ tp[ssize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ {
+ tp[qsize + ssize] = cy;
+ s1p[qsize + ssize] = 0;
+ ssize++;
+ }
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+ }
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+ else
+ {
+#if EXTEND
+ mp_size_t tsize, wsize;
+#endif
+ /* T = U*A + V*B
+ W = U*C + V*D
+ U = T
+ V = W */
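+
+	  /* When ASIGN is set the matrix entries carry opposite signs,
+	     which is why the mul_1/submul_1 roles of A, B, C and D are
+	     swapped between the two branches below.  */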
+
+#if STAT
+ { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x);
+ arr[BITS_PER_MP_LIMB - cnt]++; }
+#endif
+ if (A == 0)
+ {
+ /* B == 1 and C == 1 (D is arbitrary) */
+ mp_limb_t cy;
+ MPN_COPY (tp, vp, size);
+ MPN_COPY (wp, up, size);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ MPN_COPY (tp, s1p, ssize);
+ tsize = ssize;
+ tp[ssize] = 0; /* must zero since wp might spill below */
+ MPN_COPY (wp, s0p, ssize);
+ cy = mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ if (asign)
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, vp, size, B);
+ mpn_submul_1 (tp, up, size, A);
+ mpn_mul_1 (wp, up, size, C);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s1p, ssize, B);
+ cy += mpn_addmul_1 (tp, s0p, ssize, A);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s0p, ssize, C);
+ cy += mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, up, size, A);
+ mpn_submul_1 (tp, vp, size, B);
+ mpn_mul_1 (wp, vp, size, D);
+ mpn_submul_1 (wp, up, size, C);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s0p, ssize, A);
+ cy += mpn_addmul_1 (tp, s1p, ssize, B);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s1p, ssize, D);
+ cy += mpn_addmul_1 (wp, s0p, ssize, C);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ }
+
+ size -= up[size - 1] == 0;
+ }
+ }
+
+#if RECORD
+ printf ("max: %lx\n", max);
+#endif
+
+#if STAT
+ {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);}
+#endif
+
+ if (vsize == 0)
+ {
+ if (gp != up && gp != 0)
+ MPN_COPY (gp, up, size);
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return size;
+ }
+ else
+ {
+ mp_limb_t vl, ul, t;
+#if EXTEND
+ mp_size_t qsize, i;
+#endif
+ vl = vp[0];
+#if EXTEND
+ t = mpn_divmod_1 (wp, up, size, vl);
+
+ MPN_COPY (tp, s0p, ssize);
+
+ qsize = size - (wp[size - 1] == 0); /* size of quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
+ tp[ssize + i] = cy;
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = mpn_mod_1 (up, size, vl);
+#endif
+ ul = vl;
+ vl = t;
+ while (vl != 0)
+ {
+ mp_limb_t t;
+#if EXTEND
+ mp_limb_t q;
+ q = ul / vl;
+ t = ul - q * vl;
+
+ MPN_COPY (tp, s0p, ssize);
+
+ MPN_ZERO (s1p + ssize, 1); /* zero s1 too */
+
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp, s1p, ssize, q);
+ tp[ssize] = cy;
+ }
+
+ ssize += 1;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = ul % vl;
+#endif
+ ul = vl;
+ vl = t;
+ }
+ if (gp != 0)
+ gp[0] = ul;
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return 1;
+ }
+}
diff --git a/rts/gmp/mpn/generic/get_str.c b/rts/gmp/mpn/generic/get_str.c
new file mode 100644
index 0000000000..a713b61825
--- /dev/null
+++ b/rts/gmp/mpn/generic/get_str.c
@@ -0,0 +1,216 @@
+/* mpn_get_str -- Convert an MSIZE-long limb vector pointed to by MPTR
+   to a printable string in STR in base BASE.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Convert the limb vector pointed to by MPTR and MSIZE long to a
+ char array, using base BASE for the result array. Store the
+ result in the character array STR. STR must point to an array with
+   space for the largest possible number represented by an MSIZE-long
+ limb vector + 1 extra character.
+
+   The result is NOT in ASCII; to convert it to printable format, add
+   '0' or 'A' depending on the base and range.
+
+ Return the number of digits in the result string.
+ This may include some leading zeros.
+
+ The limb vector pointed to by MPTR is clobbered. */
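+
+   A minimal usage sketch (hedged; STR sized by the caller as described
+   above), showing the ASCII conversion step this comment mentions:
+
+     size_t i, len = mpn_get_str (str, base, mptr, msize);
+     for (i = 0; i < len; i++)
+       str[i] += (str[i] < 10 ? '0' : 'A' - 10);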
+
+size_t
+#if __STDC__
+mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize)
+#else
+mpn_get_str (str, base, mptr, msize)
+ unsigned char *str;
+ int base;
+ mp_ptr mptr;
+ mp_size_t msize;
+#endif
+{
+ mp_limb_t big_base;
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+ int normalization_steps;
+#endif
+#if UDIV_TIME > 2 * UMUL_TIME
+ mp_limb_t big_base_inverted;
+#endif
+ unsigned int dig_per_u;
+ mp_size_t out_len;
+ register unsigned char *s;
+
+ big_base = __mp_bases[base].big_base;
+
+ s = str;
+
+ /* Special case zero, as the code below doesn't handle it. */
+ if (msize == 0)
+ {
+ s[0] = 0;
+ return 1;
+ }
+
+ if ((base & (base - 1)) == 0)
+ {
+      /* The base is a power of 2.  Convert from the most significant
+	 side. */
+ mp_limb_t n1, n0;
+ register int bits_per_digit = big_base;
+ register int x;
+ register int bit_pos;
+ register int i;
+
+ n1 = mptr[msize - 1];
+ count_leading_zeros (x, n1);
+
+      /* BIT_POS should be R when the input ends in the least significant
+	 nibble, and R + bits_per_digit * n when it ends in the nth least
+	 significant nibble.  */
+
+ {
+ int bits;
+
+ bits = BITS_PER_MP_LIMB * msize - x;
+ x = bits % bits_per_digit;
+ if (x != 0)
+ bits += bits_per_digit - x;
+ bit_pos = bits - (msize - 1) * BITS_PER_MP_LIMB;
+ }
+
+ /* Fast loop for bit output. */
+ i = msize - 1;
+ for (;;)
+ {
+ bit_pos -= bits_per_digit;
+ while (bit_pos >= 0)
+ {
+ *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1);
+ bit_pos -= bits_per_digit;
+ }
+ i--;
+ if (i < 0)
+ break;
+ n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1);
+ n1 = mptr[i];
+ bit_pos += BITS_PER_MP_LIMB;
+ *s++ = n0 | (n1 >> bit_pos);
+ }
+
+ *s = 0;
+
+ return s - str;
+ }
+ else
+ {
+      /* General case.  The base is not a power of 2.  Convert from the
+	 least significant end. */
+
+      /* If udiv_qrnnd only handles divisors with the most significant bit
+	 set, prepare BIG_BASE for use as a divisor by shifting it left
+	 exactly enough to set the most significant bit. */
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+ count_leading_zeros (normalization_steps, big_base);
+ big_base <<= normalization_steps;
+#if UDIV_TIME > 2 * UMUL_TIME
+ /* Get the fixed-point approximation to 1/(BIG_BASE << NORMALIZATION_STEPS). */
+ big_base_inverted = __mp_bases[base].big_base_inverted;
+#endif
+#endif
+
+ dig_per_u = __mp_bases[base].chars_per_limb;
+ out_len = ((size_t) msize * BITS_PER_MP_LIMB
+ * __mp_bases[base].chars_per_bit_exactly) + 1;
+ s += out_len;
+
+ while (msize != 0)
+ {
+ int i;
+ mp_limb_t n0, n1;
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+	  /* If we shifted BIG_BASE above, shift the dividend too, to get
+	     the right quotient.  We need to do this on every iteration:
+	     the quotient is stored correctly, but it becomes the dividend
+	     of the next iteration, and that dividend needs to be
+	     up-shifted again. */
+ if (normalization_steps != 0)
+ {
+ n0 = mpn_lshift (mptr, mptr, msize, normalization_steps);
+
+ /* If the shifting gave a carry out limb, store it and
+ increase the length. */
+ if (n0 != 0)
+ {
+ mptr[msize] = n0;
+ msize++;
+ }
+ }
+#endif
+
+	  /* Divide the number at MPTR by BIG_BASE to get a quotient and a
+	     remainder.  The remainder is our new digit in base BIG_BASE. */
+ i = msize - 1;
+ n1 = mptr[i];
+
+ if (n1 >= big_base)
+ n1 = 0;
+ else
+ {
+ msize--;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = mptr[i];
+#if UDIV_TIME > 2 * UMUL_TIME
+ udiv_qrnnd_preinv (mptr[i], n1, n1, n0, big_base, big_base_inverted);
+#else
+ udiv_qrnnd (mptr[i], n1, n1, n0, big_base);
+#endif
+ }
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+      /* If we shifted above (at the previous UDIV_NEEDS_NORMALIZATION
+	 test), the remainder is up-shifted here.  Compensate. */
+ n1 >>= normalization_steps;
+#endif
+
+ /* Convert N1 from BIG_BASE to a string of digits in BASE
+ using single precision operations. */
+ for (i = dig_per_u - 1; i >= 0; i--)
+ {
+ *--s = n1 % base;
+ n1 /= base;
+ if (n1 == 0 && msize == 0)
+ break;
+ }
+ }
+
+ while (s != str)
+ *--s = 0;
+ return out_len;
+ }
+}
diff --git a/rts/gmp/mpn/generic/gmp-mparam.h b/rts/gmp/mpn/generic/gmp-mparam.h
new file mode 100644
index 0000000000..14bcaece83
--- /dev/null
+++ b/rts/gmp/mpn/generic/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/generic/hamdist.c b/rts/gmp/mpn/generic/hamdist.c
new file mode 100644
index 0000000000..35c10e8450
--- /dev/null
+++ b/rts/gmp/mpn/generic/hamdist.c
@@ -0,0 +1,94 @@
+/* mpn_hamdist --
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seems to
+   implement the POPC instruction.  Disable this pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Cool population count of an mp_limb_t.
+   You have to figure out how this works, I won't tell you!  */
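+
+/* (The steps below are the classic bit-parallel "SWAR" population count:
+   count the bits within each 2-bit field, then fold adjacent fields of
+   4, 8, ... bits together until the whole limb's count collects in the
+   low byte.) */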
+
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+ mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+  /* We have to go to some trouble to define these constants.
+ (For mp_limb_t being `long long'.) */
+ mp_limb_t cnst;
+ cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+ x -= (x & cnst) >> 1;
+ cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+ x = ((x & ~cnst) >> 2) + (x & cnst);
+ cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+ x = ((x >> 4) + x) & cnst;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x);
+ x = ((x >> 32) + x) & 0xff;
+#endif
+#if BITS_PER_MP_LIMB == 32
+ x -= (x & 0xaaaaaaaa) >> 1;
+ x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);
+ x = ((x >> 4) + x) & 0x0f0f0f0fL;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x) & 0xff;
+#endif
+ return x;
+}
+#endif
+
+unsigned long int
+#if __STDC__
+mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t size)
+#else
+mpn_hamdist (up, vp, size)
+ register mp_srcptr up;
+ register mp_srcptr vp;
+ register mp_size_t size;
+#endif
+{
+ unsigned long int hamdist;
+ mp_size_t i;
+
+ hamdist = 0;
+ for (i = 0; i < size; i++)
+ hamdist += popc_limb (up[i] ^ vp[i]);
+
+ return hamdist;
+}
diff --git a/rts/gmp/mpn/generic/inlines.c b/rts/gmp/mpn/generic/inlines.c
new file mode 100644
index 0000000000..9487e58cf2
--- /dev/null
+++ b/rts/gmp/mpn/generic/inlines.c
@@ -0,0 +1,24 @@
+/*
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#define _FORCE_INLINES
+#define _EXTERN_INLINE /* empty */
+#include "gmp.h"
diff --git a/rts/gmp/mpn/generic/jacbase.c b/rts/gmp/mpn/generic/jacbase.c
new file mode 100644
index 0000000000..dd437f1ac1
--- /dev/null
+++ b/rts/gmp/mpn/generic/jacbase.c
@@ -0,0 +1,136 @@
+/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments.
+
+ THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO
+ INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if COUNT_TRAILING_ZEROS_TIME <= 7
+/* If count_trailing_zeros is fast, use it.
+ K7 at 7 cycles and P6 at 2 are good here. K6 at 12-27 and P5 at 18-42
+   are not.  The default of 15 in longlong.h is meant to signal "not good
+   here". */
+
+#define PROCESS_TWOS_ANY \
+ { \
+ mp_limb_t twos; \
+ count_trailing_zeros (twos, a); \
+ result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \
+ a >>= twos; \
+ }
+
+#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY
+
+#else
+/* Use a loop instead. With "a" uniformly distributed there will usually be
+ only a few trailing zeros.
+
+ Unfortunately the branch for the while loop here will be on a 50/50
+ chance of a 1 or 0, which is bad for branch prediction. */
+
+#define PROCESS_TWOS_EVEN \
+ { \
+ int two; \
+ two = JACOBI_TWO_U_BIT1 (b); \
+ do \
+ { \
+ a >>= 1; \
+ result_bit1 ^= two; \
+ ASSERT (a != 0); \
+ } \
+ while ((a & 1) == 0); \
+ }
+
+#define PROCESS_TWOS_ANY \
+ if ((a & 1) == 0) \
+ PROCESS_TWOS_EVEN;
+
+#endif
+
+
+/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
+ with a restricted range of inputs accepted, namely b>1, b odd, and a<=b.
+
+ The initial result_bit1 is taken as a parameter for the convenience of
+ mpz_kronecker_zi_ui() et al. The sign changes both here and in those
+ routines accumulate nicely in bit 1, see the JACOBI macros.
+
+ The return value here is the normal +1, 0, or -1. Note that +1 and -1
+ have bit 1 in the "BIT1" sense, which could be useful if the caller is
+ accumulating it into some extended calculation.
+
+ Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
+ possible, but a couple of tests suggest it's not a significant speedup,
+ and may even be a slowdown, so what's here is good enough for now.
+
+ Future: The code doesn't demand a<=b actually, so maybe this could be
+ relaxed. All the places this is used currently call with a<=b though. */
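+
+/* Worked example (hedged; assuming an initial result_bit1 of 0, i.e. no
+   accumulated sign): mpn_jacobi_base (3, 7, 0) returns -1, since 3 is a
+   quadratic non-residue mod 7 -- the squares mod 7 are 1, 2 and 4. */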
+
+int
+#if __STDC__
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
+#else
+mpn_jacobi_base (a, b, result_bit1)
+ mp_limb_t a;
+ mp_limb_t b;
+ int result_bit1;
+#endif
+{
+ ASSERT (b & 1); /* b odd */
+ ASSERT (b != 1);
+ ASSERT (a <= b);
+
+ if (a == 0)
+ return 0;
+
+ PROCESS_TWOS_ANY;
+ if (a == 1)
+ goto done;
+
+ for (;;)
+ {
+ result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
+ MP_LIMB_T_SWAP (a, b);
+
+ do
+ {
+ /* working on (a/b), a,b odd, a>=b */
+ ASSERT (a & 1);
+ ASSERT (b & 1);
+ ASSERT (a >= b);
+
+ if ((a -= b) == 0)
+ return 0;
+
+ PROCESS_TWOS_EVEN;
+ if (a == 1)
+ goto done;
+ }
+ while (a >= b);
+ }
+
+ done:
+ return JACOBI_BIT1_TO_PN (result_bit1);
+}
diff --git a/rts/gmp/mpn/generic/lshift.c b/rts/gmp/mpn/generic/lshift.c
new file mode 100644
index 0000000000..0b58389658
--- /dev/null
+++ b/rts/gmp/mpn/generic/lshift.c
@@ -0,0 +1,87 @@
+/* mpn_lshift -- Shift left low level.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left
+ and store the USIZE least significant digits of the result at WP.
+ Return the bits shifted out from the most significant digit.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be >= UP.
+*/
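+
+/* For instance (assuming 32-bit limbs): shifting the single limb
+   {0x80000000} left by cnt=1 stores {0} at WP and returns 1, the bit
+   shifted out of the top. */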
+
+mp_limb_t
+#if __STDC__
+mpn_lshift (register mp_ptr wp,
+ register mp_srcptr up, mp_size_t usize,
+ register unsigned int cnt)
+#else
+mpn_lshift (wp, up, usize, cnt)
+ register mp_ptr wp;
+ register mp_srcptr up;
+ mp_size_t usize;
+ register unsigned int cnt;
+#endif
+{
+ register mp_limb_t high_limb, low_limb;
+ register unsigned sh_1, sh_2;
+ register mp_size_t i;
+ mp_limb_t retval;
+
+#ifdef DEBUG
+ if (usize == 0 || cnt == 0)
+ abort ();
+#endif
+
+ sh_1 = cnt;
+#if 0
+ if (sh_1 == 0)
+ {
+ if (wp != up)
+ {
+ /* Copy from high end to low end, to allow specified input/output
+ overlapping. */
+ for (i = usize - 1; i >= 0; i--)
+ wp[i] = up[i];
+ }
+ return 0;
+ }
+#endif
+
+ wp += 1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
+ i = usize - 1;
+ low_limb = up[i];
+ retval = low_limb >> sh_2;
+ high_limb = low_limb;
+ while (--i >= 0)
+ {
+ low_limb = up[i];
+ wp[i] = (high_limb << sh_1) | (low_limb >> sh_2);
+ high_limb = low_limb;
+ }
+ wp[i] = high_limb << sh_1;
+
+ return retval;
+}
diff --git a/rts/gmp/mpn/generic/mod_1.c b/rts/gmp/mpn/generic/mod_1.c
new file mode 100644
index 0000000000..168ec9df49
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1.c
@@ -0,0 +1,175 @@
+/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) --
+   Divide (DIVIDEND_PTR,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
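+
+/* Usage sketch (hypothetical caller, with {xp,xn} an arbitrary operand):
+   since the divisor is unconstrained, the decimal remainder is simply
+
+     mp_limb_t r = mpn_mod_1 (xp, xn, (mp_limb_t) 10);
+*/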
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb)
+#else
+mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb)
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n1, n0, r;
+ int dummy;
+
+ /* Botch: Should this be handled at all? Rely on callers? */
+ if (dividend_size == 0)
+ return 0;
+
+ /* If multiplication is much faster than division, and the
+ dividend is large, pre-invert the divisor, and use
+ only multiplications in the inner loop. */
+
+ /* This test should be read:
+ Does it ever help to use udiv_qrnnd_preinv?
+ && Does what we save compensate for the inversion overhead? */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6)
+ && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ divisor_limb <<= normalization_steps;
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb, divisor_limb_inverted);
+ n1 = n0;
+ }
+ udiv_qrnnd_preinv (dummy, r, r,
+ n1 << normalization_steps,
+ divisor_limb, divisor_limb_inverted);
+ return r >> normalization_steps;
+ }
+ else
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r,
+ n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+ }
+ }
+ else
+ {
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ divisor_limb <<= normalization_steps;
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (dummy, r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb);
+ n1 = n0;
+ }
+ udiv_qrnnd (dummy, r, r,
+ n1 << normalization_steps,
+ divisor_limb);
+ return r >> normalization_steps;
+ }
+ }
+ /* No normalization needed, either because udiv_qrnnd doesn't require
+ it, or because DIVISOR_LIMB is already normalized. */
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (dummy, r, r, n0, divisor_limb);
+ }
+ return r;
+ }
+}
diff --git a/rts/gmp/mpn/generic/mod_1_rs.c b/rts/gmp/mpn/generic/mod_1_rs.c
new file mode 100644
index 0000000000..62aaa94b92
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1_rs.c
@@ -0,0 +1,111 @@
+/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift.
+
+ THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE
+ INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.
+ IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be
+ changed to 1 temporarily to test the code under that case too. */
+#if 0
+#undef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 1
+#endif
+
+
+/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size
+ is unchanged, the shift is only for its effect on the remainder.
+ The shift doesn't even need to be considered until the last limb.
+
+ This function has the normal size!=0 restriction, unlike the basic
+ mpn_mod_1. */
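+
+/* Example (assuming 32-bit limbs): with {ptr,2} = 2^32 + 4, shift = 2 and
+   divisor = 7, the result is ((2^32 + 4) >> 2) % 7 = (2^30 + 1) % 7 = 2,
+   while {ptr,2} itself is left unchanged. */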
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift,
+ mp_limb_t divisor)
+#else
+mpn_mod_1_rshift (ptr, size, shift, divisor)
+ mp_srcptr ptr;
+ mp_size_t size;
+ unsigned shift;
+ mp_limb_t divisor;
+#endif
+{
+ mp_limb_t quot, rem;
+
+ ASSERT (shift >= 1);
+ ASSERT (shift < BITS_PER_MP_LIMB);
+ ASSERT (size >= 1);
+
+ if (size == 1)
+ return (ptr[0] >> shift) % divisor;
+
+#if UDIV_NEEDS_NORMALIZATION
+ {
+ int norm;
+ int delta;
+
+ count_leading_zeros (norm, divisor);
+ divisor <<= norm;
+
+ delta = shift - norm;
+ if (delta == 0)
+ return mpn_mod_1 (ptr, size, divisor) >> norm;
+
+ if (delta > 0)
+ {
+ rem = mpn_mod_1 (ptr+1, size-1, divisor);
+ udiv_qrnnd (quot, rem,
+ rem >> delta,
+ (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta),
+ divisor);
+ return rem >> norm;
+ }
+ else
+ {
+ rem = mpn_mod_1 (ptr, size, divisor);
+ udiv_qrnnd (quot, rem,
+ rem >> (BITS_PER_MP_LIMB+delta),
+ rem << -delta,
+ divisor);
+ return rem >> norm;
+ }
+ }
+
+#else /* !UDIV_NEEDS_NORMALIZATION */
+
+ rem = mpn_mod_1 (ptr+1, size-1, divisor);
+ udiv_qrnnd (quot, rem,
+ rem >> shift,
+ (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift),
+ divisor);
+ return rem;
+
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul.c b/rts/gmp/mpn/generic/mul.c
new file mode 100644
index 0000000000..cecfa19ca1
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul.c
@@ -0,0 +1,190 @@
+/* mpn_mul -- Multiply two natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
+ (pointed to by VP, with VN limbs), and store the result at PRODP. The
+ result is UN + VN limbs. Return the most significant limb of the result.
+
+   NOTE: The space pointed to by PRODP is overwritten before U and V are
+   fully read, so overlap between PRODP and the inputs is an error.
+
+ Argument constraints:
+ 1. UN >= VN.
+ 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
+ the multiplier and the multiplicand. */
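+
+/* Call sketch (hypothetical sizes): for a 4-limb U and a 3-limb V, PRODP
+   must provide 4 + 3 = 7 limbs; the return value duplicates prodp[6],
+   the most significant limb of the product. */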
+
+void
+#if __STDC__
+mpn_sqr_n (mp_ptr prodp,
+ mp_srcptr up, mp_size_t un)
+#else
+mpn_sqr_n (prodp, up, un)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t un;
+#endif
+{
+ if (un < KARATSUBA_SQR_THRESHOLD)
+ { /* plain schoolbook multiplication */
+ if (un == 0)
+ return;
+ mpn_sqr_basecase (prodp, up, un);
+ }
+ else if (un < TOOM3_SQR_THRESHOLD)
+ { /* karatsuba multiplication */
+ mp_ptr tspace;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+ mpn_kara_sqr_n (prodp, up, un, tspace);
+ TMP_FREE (marker);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (un < FFT_SQR_THRESHOLD)
+#else
+ else
+#endif
+ { /* toom3 multiplication */
+ mp_ptr tspace;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+ mpn_toom3_sqr_n (prodp, up, un, tspace);
+ TMP_FREE (marker);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else
+ {
+ /* schoenhage multiplication */
+ mpn_mul_fft_full (prodp, up, un, up, un);
+ }
+#endif
+}
+
+mp_limb_t
+#if __STDC__
+mpn_mul (mp_ptr prodp,
+ mp_srcptr up, mp_size_t un,
+ mp_srcptr vp, mp_size_t vn)
+#else
+mpn_mul (prodp, up, un, vp, vn)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t un;
+ mp_srcptr vp;
+ mp_size_t vn;
+#endif
+{
+ mp_size_t l;
+ mp_limb_t c;
+
+ if (up == vp && un == vn)
+ {
+ mpn_sqr_n (prodp, up, un);
+ return prodp[2 * un - 1];
+ }
+
+ if (vn < KARATSUBA_MUL_THRESHOLD)
+ { /* long multiplication */
+ mpn_mul_basecase (prodp, up, un, vp, vn);
+ return prodp[un + vn - 1];
+ }
+
+ mpn_mul_n (prodp, up, vp, vn);
+ if (un != vn)
+ { mp_limb_t t;
+ mp_ptr ws;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ prodp += vn;
+ l = vn;
+ up += vn;
+ un -= vn;
+
+ if (un < vn)
+ {
+ /* Swap u's and v's. */
+ MPN_SRCPTR_SWAP (up,un, vp,vn);
+ }
+
+ ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn)
+ * BYTES_PER_MP_LIMB);
+
+ t = 0;
+ while (vn >= KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_n (ws, up, vp, vn);
+ if (l <= 2*vn)
+ {
+ t += mpn_add_n (prodp, prodp, ws, l);
+ if (l != 2*vn)
+ {
+ t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
+ l = 2*vn;
+ }
+ }
+ else
+ {
+ c = mpn_add_n (prodp, prodp, ws, 2*vn);
+ t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
+ }
+ prodp += vn;
+ l -= vn;
+ up += vn;
+ un -= vn;
+ if (un < vn)
+ {
+ /* Swap u's and v's. */
+ MPN_SRCPTR_SWAP (up,un, vp,vn);
+ }
+ }
+
+ if (vn)
+ {
+ mpn_mul_basecase (ws, up, un, vp, vn);
+ if (l <= un + vn)
+ {
+ t += mpn_add_n (prodp, prodp, ws, l);
+ if (l != un + vn)
+ t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
+ }
+ else
+ {
+ c = mpn_add_n (prodp, prodp, ws, un + vn);
+ t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
+ }
+ }
+
+ TMP_FREE (marker);
+ }
+ return prodp[un + vn - 1];
+}
diff --git a/rts/gmp/mpn/generic/mul_1.c b/rts/gmp/mpn/generic/mul_1.c
new file mode 100644
index 0000000000..1c36b5fb1f
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_1.c
@@ -0,0 +1,59 @@
+/* mpn_mul_1 -- Multiply a limb vector with a single limb and
+ store the product in a second limb vector.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+
+  /* The loop counter and index J go from -S1_SIZE to -1.  Counting up
+     toward zero makes the loop faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ res_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/mul_basecase.c b/rts/gmp/mpn/generic/mul_basecase.c
new file mode 100644
index 0000000000..00c06aa5c4
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_basecase.c
@@ -0,0 +1,87 @@
+/* mpn_mul_basecase -- Internal routine to multiply two natural numbers
+ of length m and n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Handle simple cases with traditional multiplication.
+
+ This is the most critical code of multiplication. All multiplies rely on
+ this, both small and huge. Small ones arrive here immediately, huge ones
+ arrive here as this is the base case for Karatsuba's recursive algorithm. */
+
+void
+#if __STDC__
+mpn_mul_basecase (mp_ptr prodp,
+ mp_srcptr up, mp_size_t usize,
+ mp_srcptr vp, mp_size_t vsize)
+#else
+mpn_mul_basecase (prodp, up, usize, vp, vsize)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t usize;
+ mp_srcptr vp;
+ mp_size_t vsize;
+#endif
+{
+  /* We first multiply by the low-order one or two limbs, since that result
+     can be stored (rather than added) into PROD; this also avoids a
+     separate zeroing loop. */
+#if HAVE_NATIVE_mpn_mul_2
+ if (vsize >= 2)
+ {
+ prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]);
+ prodp += 2, vp += 2, vsize -= 2;
+ }
+ else
+ {
+ prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+ return;
+ }
+#else
+ prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+ prodp += 1, vp += 1, vsize -= 1;
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_2
+ while (vsize >= 2)
+ {
+ prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]);
+ prodp += 2, vp += 2, vsize -= 2;
+ }
+ if (vsize != 0)
+ prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+#else
+ /* For each iteration in the loop, multiply U with one limb from V, and
+ add the result to PROD. */
+ while (vsize != 0)
+ {
+ prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+ prodp += 1, vp += 1, vsize -= 1;
+ }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul_fft.c b/rts/gmp/mpn/generic/mul_fft.c
new file mode 100644
index 0000000000..00fd6d72de
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_fft.c
@@ -0,0 +1,772 @@
+/* An implementation in GMP of Scho"nhage's fast multiplication algorithm
+ modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
+ MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
+ INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
+ A FUTURE GNU MP RELEASE.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+/* References:
+
+ Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker
+ Strassen, Computing 7, p. 281-292, 1971.
+
+ Asymptotically fast algorithms for the numerical multiplication
+ and division of polynomials with complex coefficients, by Arnold Scho"nhage,
+ Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
+
+ Tapes versus Pointers, a study in implementing fast algorithms,
+ by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
+
+ See also http://www.loria.fr/~zimmerma/bignum
+
+
+ Future:
+
+   K==2 isn't needed in the current uses of this code, and the parts
+   specific to it could be dropped.
+
+ It might be possible to avoid a small number of MPN_COPYs by using a
+ rotating temporary or two.
+
+ Multiplications of unequal sized operands can be done with this code, but
+ it needs a tighter test for identifying squaring (same sizes as well as
+ same pointers). */
+
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+
+FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
+ FFT_MUL_TABLE,
+ FFT_SQR_TABLE
+};
+
+
+static void mpn_mul_fft_internal
+_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_limb_t **Ap, mp_limb_t **Bp,
+ mp_limb_t *A, mp_limb_t *B,
+ mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l,
+ mp_limb_t *T, int rec));
+
+
+/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
+   sqr==0 for a multiply, sqr==1 for a square.  */
+int
+#if __STDC__
+mpn_fft_best_k (mp_size_t n, int sqr)
+#else
+mpn_fft_best_k (n, sqr)
+ mp_size_t n;
+ int sqr;
+#endif
+{
+ mp_size_t t;
+ int i;
+
+ for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
+ if (n < mpn_fft_table[sqr][i])
+ return i + FFT_FIRST_K;
+
+ /* treat 4*last as one further entry */
+ if (i == 0 || n < 4*mpn_fft_table[sqr][i-1])
+ return i + FFT_FIRST_K;
+ else
+ return i + FFT_FIRST_K + 1;
+}
+
+
+/* Returns the smallest possible number of limbs >= pl for an FFT of size 2^k.
+ FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */
+
+mp_size_t
+#if __STDC__
+mpn_fft_next_size (mp_size_t pl, int k)
+#else
+mpn_fft_next_size (pl, k)
+ mp_size_t pl;
+ int k;
+#endif
+{
+ mp_size_t N, M;
+ int K;
+
+ /* if (k==0) k = mpn_fft_best_k (pl, sqr); */
+ N = pl*BITS_PER_MP_LIMB;
+ K = 1<<k;
+ if (N%K) N=(N/K+1)*K;
+ M = N/K;
+ if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K;
+ return (N/BITS_PER_MP_LIMB);
+}
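+
+/* Regarding the FIXME above (a sketch, not a guarantee): writing
+   b = BITS_PER_MP_LIMB and K = 2^k, the function returns
+   K * ceil (ceil (pl*b/K) / b) = K * ceil (pl/K), i.e. pl rounded up to
+   the next multiple of 2^k, so the answer appears to be yes. */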
+
+
+static void
+#if __STDC__
+mpn_fft_initl(int **l, int k)
+#else
+mpn_fft_initl(l, k)
+ int **l;
+ int k;
+#endif
+{
+ int i,j,K;
+
+ l[0][0] = 0;
+ for (i=1,K=2;i<=k;i++,K*=2) {
+ for (j=0;j<K/2;j++) {
+ l[i][j] = 2*l[i-1][j];
+ l[i][K/2+j] = 1+l[i][j];
+ }
+ }
+}
+
+
+/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */
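+/* Sketch of why this works: writing a = ap[n]*2^N + alow with
+   N = n*BITS_PER_MP_LIMB, we have 2^N == -1, so -a == ap[n] - alow.
+   The one's complement gives 2^N - 1 - alow, and adding ap[n] + 2
+   yields ap[n] - alow modulo 2^N + 1. */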
+static void
+#if __STDC__
+mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n)
+#else
+mpn_fft_neg_modF(ap, n)
+ mp_limb_t *ap;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t c;
+
+ c = ap[n]+2;
+ mpn_com_n (ap, ap, n);
+ ap[n]=0; mpn_incr_u(ap, c);
+}
+
+
+/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
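+/* Since 2^N == -1 (mod 2^N + 1) with N = n*BITS_PER_MP_LIMB, a shift by
+   e = q*N + d amounts to a shift by d bits followed by a negation when q
+   is odd, which is exactly how the code below decomposes e. */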
+static void
+#if __STDC__
+mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_mul_2exp_modF(ap, e, n, tp)
+ mp_limb_t *ap;
+ int e;
+ mp_size_t n;
+ mp_limb_t *tp;
+#endif
+{
+ int d, sh, i; mp_limb_t cc;
+
+ d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
+ sh = d % BITS_PER_MP_LIMB;
+ if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */
+ else MPN_COPY(tp, ap, n+1);
+ d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
+ if (d) {
+ /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
+ /* mpn_xor would be more efficient here */
+ for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i];
+ cc = 1-mpn_add_1(ap, ap, d, 1);
+ if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1);
+ else MPN_COPY(ap+d, tp, n-d);
+ if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n]))
+ ap[n]=mpn_add_1(ap, ap, n, cc);
+ else ap[n]=0;
+ }
+ else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) {
+ ap[n]=mpn_add_1(ap, ap, n, 1);
+ }
+ if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n);
+}
+
+
+/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n)
+#else
+mpn_fft_add_modF (ap, bp, n)
+ mp_limb_t *ap,*bp;
+ int n;
+#endif
+{
+ mp_limb_t c;
+
+ c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n);
+ if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1);
+ ap[n]=c;
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+ N=n*BITS_PER_MP_LIMB
+ 2^omega is a primitive root mod 2^N+1
+ output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
+
+static void
+#if __STDC__
+mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll,
+ mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp)
+mp_limb_t **Ap,*tp;
+mp_size_t K,omega,n,inc;
+int **ll;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+ if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+ Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+ }
+ else {
+ int j, inc2=2*inc;
+ int *lk = *ll;
+ mp_limb_t *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp);
+ mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp);
+ /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+ A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+ for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) {
+ MPN_COPY(tp, Ap[inc], n+1);
+ mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[inc], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+ N=n*BITS_PER_MP_LIMB
+ 2^omega is a primitive root mod 2^N+1
+ output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
+
+static void
+#if __STDC__
+mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll,
+ mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp)
+ mp_limb_t **Ap,**Bp,*tp;
+ mp_size_t K,omega,n,inc;
+ int **ll;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+ if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+ Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+#ifdef ADDSUB
+ if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Bp[0], n+1);
+ mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1);
+ if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1))
+#endif
+ Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1);
+ }
+ else {
+ int j, inc2=2*inc;
+ int *lk=*ll;
+ mp_limb_t *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp);
+ mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp);
+ /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+ A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+ for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) {
+ MPN_COPY(tp, Ap[inc], n+1);
+ mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[inc], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ MPN_COPY(tp, Bp[inc], n+1);
+ mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Bp[inc], Bp[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Bp[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
+static void
+#if __STDC__
+mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K)
+#else
+mpn_fft_mul_modF_K(ap, bp, n, K)
+ mp_limb_t **ap, **bp;
+ mp_size_t n;
+ int K;
+#endif
+{
+ int i;
+ int sqr = (ap == bp);
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+
+ if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+ int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
+ int **_fft_l;
+ mp_limb_t **Ap,**Bp,*A,*B,*T;
+
+ k = mpn_fft_best_k (n, sqr);
+ K2 = 1<<k;
+ maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
+ M2 = n*BITS_PER_MP_LIMB/K2;
+ l = n/K2;
+ Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
+ nprime2 = Nprime2/BITS_PER_MP_LIMB;
+ Mp2 = Nprime2/K2;
+
+ Ap = TMP_ALLOC_MP_PTRS (K2);
+ Bp = TMP_ALLOC_MP_PTRS (K2);
+ A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1));
+ T = TMP_ALLOC_LIMBS (nprime2+1);
+ B = A + K2*(nprime2+1);
+ _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+ for (i=0;i<=k;i++)
+ _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl(_fft_l, k);
+
+ TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
+ n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
+
+ for (i=0;i<K;i++,ap++,bp++)
+ mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
+ l, Mp2, _fft_l, T, 1);
+ }
+ else {
+ mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n;
+ tp = TMP_ALLOC_LIMBS (n2);
+ tpn = tp+n;
+ TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n));
+ for (i=0;i<K;i++) {
+ a = *ap++; b=*bp++;
+ if (sqr)
+ mpn_sqr_n(tp, a, n);
+ else
+ mpn_mul_n(tp, b, a, n);
+ if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0;
+ if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n];
+ if (cc) {
+ cc = mpn_add_1(tp, tp, n2, cc);
+ ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc));
+ }
+ a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1);
+ }
+ }
+ TMP_FREE(marker);
+}
+
+
+/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
+ output: K*A[0] K*A[K-1] ... K*A[1] */
+
+static void
+#if __STDC__
+mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n,
+ mp_limb_t *tp)
+#else
+mpn_fft_fftinv(Ap,K,omega,n,tp)
+ mp_limb_t **Ap, *tp;
+ int K;
+ mp_size_t omega, n;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[1], n+1);
+ if (mpn_sub_n(Ap[1], tp, Ap[1], n+1))
+#endif
+ Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1);
+ }
+ else {
+ int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fftinv(Ap, K2, 2*omega, n, tp);
+ mpn_fft_fftinv(Bp, K2, 2*omega, n, tp);
+ /* A[j] <- A[j] + omega^j A[j+K/2]
+ A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
+ for (j=0;j<K2;j++,Ap++,Bp++) {
+ MPN_COPY(tp, Bp[0], n+1);
+ mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp);
+ mpn_fft_add_modF(Bp[0], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_div_2exp_modF(ap,k,n,tp)
+ mp_limb_t *ap,*tp;
+ int k;
+ mp_size_t n;
+#endif
+{
+ int i;
+
+ i = 2*n*BITS_PER_MP_LIMB;
+ i = (i-k) % i;
+ mpn_fft_mul_2exp_modF(ap,i,n,tp);
+ /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
+ /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
+ if (ap[n]==1) {
+ for (i=0;i<n && ap[i]==0;i++);
+ if (i<n) {
+ ap[n]=0;
+ mpn_sub_1(ap, ap, n, 1);
+ }
+ }
+}
+
+
+/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
+static void
+#if __STDC__
+mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an)
+#else
+mpn_fft_norm_modF(rp, ap, n, an)
+ mp_limb_t *rp;
+ mp_limb_t *ap;
+ mp_size_t n;
+ mp_size_t an;
+#endif
+{
+ mp_size_t l;
+
+ if (an>2*n) {
+ l = n;
+ rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an,
+ mpn_add_n(rp,ap,ap+2*n,an-2*n));
+ }
+ else {
+ l = an-n;
+ MPN_COPY(rp, ap, n);
+ rp[n]=0;
+ }
+ if (mpn_sub_n(rp,rp,ap+n,l)) {
+ if (mpn_sub_1(rp+l,rp+l,n+1-l,1))
+ rp[n]=mpn_add_1(rp,rp,n,1);
+ }
+}
+
+
+static void
+#if __STDC__
+mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_limb_t **Ap, mp_limb_t **Bp,
+ mp_limb_t *A, mp_limb_t *B,
+ mp_size_t nprime, mp_size_t l, mp_size_t Mp,
+ int **_fft_l,
+ mp_limb_t *T, int rec)
+#else
+mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec)
+ mp_limb_t *op;
+ mp_srcptr n, m;
+ mp_limb_t **Ap,**Bp,*A,*B,*T;
+ mp_size_t pl,nprime;
+ int **_fft_l;
+ int k,K,l,Mp,rec;
+#endif
+{
+ int i, sqr, pla, lo, sh, j;
+ mp_limb_t *p;
+
+ sqr = (n==m);
+
+ TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
+ pl,k,K,nprime,l,Mp,rec,sqr));
+
+ /* decomposition of inputs into arrays Ap[i] and Bp[i] */
+ if (rec) for (i=0;i<K;i++) {
+ Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+ /* store the next M bits of n into A[i] */
+    /* assumes that M is a multiple of BITS_PER_MP_LIMB */
+ MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l);
+ /* set most significant bits of n and m (important in recursive calls) */
+ if (i==K-1) Ap[i][l]=n[0];
+ mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+ if (!sqr) {
+ MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l);
+ if (i==K-1) Bp[i][l]=m[0];
+ mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+ }
+ }
+
+ /* direct fft's */
+ if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T);
+ else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T);
+
+ /* term to term multiplications */
+ mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K);
+
+ /* inverse fft's */
+ mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T);
+
+ /* division of terms after inverse fft */
+ for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T);
+
+ /* addition of terms in result p */
+ MPN_ZERO(T,nprime+1);
+ pla = l*(K-1)+nprime+1; /* number of required limbs for p */
+ p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */
+ MPN_ZERO(p, pla);
+ sqr=0; /* will accumulate the (signed) carry at p[pla] */
+ for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) {
+ mp_ptr n = p+sh;
+ j = (K-i)%K;
+ if (mpn_add_n(n,n,Ap[j],nprime+1))
+ sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1);
+ T[2*l]=i+1; /* T = (i+1)*2^(2*M) */
+ if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */
+ sqr -= mpn_sub_1(n,n,pla-sh,1);
+ sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1);
+ }
+ }
+ if (sqr==-1) {
+ if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) {
+ /* p[pla-pl]...p[pla-1] are all zero */
+ mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1);
+ mpn_sub_1(p+pla-1,p+pla-1,1,1);
+ }
+ }
+ else if (sqr==1) {
+ if (pla>=2*pl)
+ while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr)));
+ else {
+ sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr);
+ ASSERT (sqr == 0);
+ }
+ }
+ else
+ ASSERT (sqr == 0);
+
+ /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
+ < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
+ < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
+ mpn_fft_norm_modF(op,p,pl,pla);
+}
+
+
+/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
+ n and m have respectively nl and ml limbs
+ op must have space for pl+1 limbs
+ One must have pl = mpn_fft_next_size(pl, k).
+*/
+
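+/* A hedged usage sketch (not part of this file): a caller picks k,
+   rounds the product size with mpn_fft_next_size so the precondition
+   above holds, and provides pl+1 result limbs, exactly as
+   mpn_mul_fft_full does further down. */
+#if 0
+static void
+mul_fft_call_demo (mp_ptr rp, mp_srcptr xp, mp_size_t xn,
+                   mp_srcptr yp, mp_size_t yn)
+{
+  int k = mpn_fft_best_k (xn + yn, xp == yp);
+  mp_size_t pl = mpn_fft_next_size (xn + yn, k);
+  /* rp must have room for pl+1 limbs */
+  mpn_mul_fft (rp, pl, xp, xn, yp, yn, k);  /* rp = x*y mod 2^(pl*L)+1 */
+}
+#endif
+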
+void
+#if __STDC__
+mpn_mul_fft (mp_ptr op, mp_size_t pl,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml,
+ int k)
+#else
+mpn_mul_fft (op, pl, n, nl, m, ml, k)
+ mp_ptr op;
+ mp_size_t pl;
+ mp_srcptr n;
+ mp_size_t nl;
+ mp_srcptr m;
+ mp_size_t ml;
+ int k;
+#endif
+{
+ int K,maxLK,i,j;
+ mp_size_t N,Nprime,nprime,M,Mp,l;
+ mp_limb_t **Ap,**Bp,*A,*T,*B;
+ int **_fft_l;
+ int sqr = (n==m && nl==ml);
+ TMP_DECL(marker);
+
+ TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n",
+ pl, nl, ml, k));
+ ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl);
+
+ TMP_MARK(marker);
+ N = pl*BITS_PER_MP_LIMB;
+ _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+ for (i=0;i<=k;i++)
+ _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl(_fft_l, k);
+ K = 1<<k;
+ M = N/K; /* N = 2^k M */
+ l = M/BITS_PER_MP_LIMB;
+ maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;
+
+ Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
+ nprime = Nprime/BITS_PER_MP_LIMB;
+ TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
+ N, K, M, l, maxLK, Nprime, nprime));
+ if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+ maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB;
+ if (Nprime % maxLK) {
+ Nprime=((Nprime/maxLK)+1)*maxLK;
+ nprime = Nprime/BITS_PER_MP_LIMB;
+ }
+ TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
+ }
+
+ T = TMP_ALLOC_LIMBS (nprime+1);
+ Mp = Nprime/K;
+
+ TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
+ pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
+ printf(" temp space %ld\n", 2*K*(nprime+1)));
+
+ A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1));
+ B = A+K*(nprime+1);
+ Ap = TMP_ALLOC_MP_PTRS (K);
+ Bp = TMP_ALLOC_MP_PTRS (K);
+ /* special decomposition for main call */
+ for (i=0;i<K;i++) {
+ Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+ /* store the next M bits of n into A[i] */
+ /* supposes that M is a multiple of BITS_PER_MP_LIMB */
+ if (nl>0) {
+ j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
+ MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j);
+ mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+ }
+ else MPN_ZERO(Ap[i], nprime+1);
+ nl -= l;
+ if (n!=m) {
+ if (ml>0) {
+ j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
+ MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j);
+ mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+ }
+ else MPN_ZERO(Bp[i], nprime+1);
+ }
+ ml -= l;
+ }
+ mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0);
+ TMP_FREE(marker);
+ _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1));
+}
+
+
+#if WANT_ASSERT
+static int
+#if __STDC__
+mpn_zero_p (mp_ptr p, mp_size_t n)
+#else
+ mpn_zero_p (p, n)
+ mp_ptr p;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ for (i = 0; i < n; i++)
+ {
+ if (p[i] != 0)
+ return 0;
+ }
+
+ return 1;
+}
+#endif
+
+
+/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.
+
+ FIXME: Duplicating the result like this is wasteful, do something better
+ perhaps at the norm_modF stage above. */
+
+void
+#if __STDC__
+mpn_mul_fft_full (mp_ptr op,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml)
+#else
+mpn_mul_fft_full (op, n, nl, m, ml)
+ mp_ptr op;
+ mp_srcptr n;
+ mp_size_t nl;
+ mp_srcptr m;
+ mp_size_t ml;
+#endif
+{
+ mp_ptr pad_op;
+ mp_size_t pl;
+ int k;
+ int sqr = (n==m && nl==ml);
+
+ k = mpn_fft_best_k (nl+ml, sqr);
+ pl = mpn_fft_next_size (nl+ml, k);
+
+ TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n",
+ nl, ml, pl, k));
+
+ pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1);
+ mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);
+
+ ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml)));
+ MPN_COPY (op, pad_op, nl+ml);
+
+ _MP_FREE_FUNC_LIMBS (pad_op, pl+1);
+}
diff --git a/rts/gmp/mpn/generic/mul_n.c b/rts/gmp/mpn/generic/mul_n.c
new file mode 100644
index 0000000000..b7563be2d3
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_n.c
@@ -0,0 +1,1343 @@
+/* mpn_mul_n and helper function -- Multiply/square natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
+
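+/* Check (a sketch): 3*INVERSE_3 = 2*(2^L-1) + 3 = 2^(L+1) + 1 = 1
+   modulo 2^L, using that 2^L-1 is divisible by 3 for even L.  For
+   L = 32: 3 * 0xAAAAAAAB = 0x200000001, i.e. 1 mod 2^32. */
+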
+#if !defined (__alpha) && !defined (__mips)
+/* For all other machines, we want to call mpn functions for the compound
+ operations instead of open-coding them. */
+#define USE_MORE_MPN
+#endif
+
+/*== Function declarations =================================================*/
+
+static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr, mp_srcptr, mp_srcptr,
+ mp_size_t, mp_size_t));
+static void interpolate3 _PROTO ((mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_size_t, mp_size_t));
+static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+
+
+/*-- mpn_kara_mul_n ---------------------------------------------------------------*/
+
+/* Multiplies using 3 half-sized mults and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
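+/* The identity used, sketched on plain integers rather than limb
+   vectors (illustrative only, not built; assumes a, b < 2^32 and a
+   64-bit unsigned long):
+     a*b = a1*b1*W^2 + (a1*b1 + a0*b0 - (a1-a0)*(b1-b0))*W + a0*b0
+   with a = a1*W + a0 and b = b1*W + b0, so three half-size products
+   suffice; the code below tracks the sign of (a1-a0)*(b1-b0) in
+   `sign' instead of using signed arithmetic. */
+#if 0
+static unsigned long
+kara_demo (unsigned long a, unsigned long b)
+{
+  unsigned long W = 1ul << 16;
+  unsigned long a1 = a / W, a0 = a % W, b1 = b / W, b0 = b % W;
+  unsigned long hi = a1 * b1, lo = a0 * b0;
+  long mid = ((long) a1 - (long) a0) * ((long) b1 - (long) b0);
+  return hi * W * W + (hi + lo - mid) * W + lo;    /* == a*b */
+}
+#endif
+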
+void
+#if __STDC__
+mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_mul_n(p, a, b, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t i, sign, w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if (n & 1)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ sign = 0;
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ w = b[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p + n3, b, b + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n3;
+ y = b;
+ sign ^= 1;
+ }
+ else
+ {
+ x = b;
+ y = b + n3;
+ }
+ mpn_sub_n (p + n3, x, y, n2);
+ }
+ p[n] = w;
+
+ n1 = n + 1;
+ if (n2 < KARATSUBA_MUL_THRESHOLD)
+ {
+ if (n3 < KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n3, p + n3, n3);
+ mpn_mul_basecase (p, a, n3, b, n3);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ }
+ mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
+ }
+
+ if (sign)
+ mpn_add_n (ws, p, ws, n1);
+ else
+ mpn_sub_n (ws, p, ws, n1);
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1))
+ {
+ mp_limb_t x = ws[nm1] + 1;
+ ws[nm1] = x;
+ if (x == 0)
+ ++ws[n];
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mp_limb_t x;
+ i = n1 + n3;
+ do
+ {
+ x = p[i] + 1;
+ p[i] = x;
+ ++i;
+ } while (x == 0);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ mp_limb_t t;
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ sign = 0;
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n2;
+ y = b;
+ sign ^= 1;
+ }
+ else
+ {
+ x = b;
+ y = b + n2;
+ }
+ mpn_sub_n (p + n2, x, y, n2);
+
+ /* Pointwise products. */
+ if (n2 < KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n2, p + n2, n2);
+ mpn_mul_basecase (p, a, n2, b, n2);
+ mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
+ mpn_kara_mul_n (p, a, b, n2, ws + n);
+ mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ if (sign)
+ w = mpn_add_n (ws, p, ws, n);
+ else
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ /* TO DO: could put "if (w) { ... }" here.
+ * Less work but badly predicted branch.
+ * No measurable difference in speed on Alpha.
+ */
+ i = n + n2;
+ t = p[i] + w;
+ p[i] = t;
+ if (t < w)
+ {
+ do
+ {
+ ++i;
+ w = p[i] + 1;
+ p[i] = w;
+ }
+ while (w == 0);
+ }
+ }
+}
+
+void
+#if __STDC__
+mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_sqr_n (p, a, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t i, sign, w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if (n & 1)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ sign = 0;
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p + n3, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign ^= 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p + n3, x, y, n2);
+ }
+ p[n] = w;
+
+ n1 = n + 1;
+ if (n2 < KARATSUBA_SQR_THRESHOLD)
+ {
+ if (n3 < KARATSUBA_SQR_THRESHOLD)
+ {
+ mpn_sqr_basecase (ws, p, n3);
+ mpn_sqr_basecase (p, a, n3);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n3, ws + n1);
+ mpn_kara_sqr_n (p, a, n3, ws + n1);
+ }
+ mpn_sqr_basecase (p + n1, a + n3, n2);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n3, ws + n1);
+ mpn_kara_sqr_n (p, a, n3, ws + n1);
+ mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
+ }
+
+ if (sign)
+ mpn_add_n (ws, p, ws, n1);
+ else
+ mpn_sub_n (ws, p, ws, n1);
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1))
+ {
+ mp_limb_t x = ws[nm1] + 1;
+ ws[nm1] = x;
+ if (x == 0)
+ ++ws[n];
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mp_limb_t x;
+ i = n1 + n3;
+ do
+ {
+ x = p[i] + 1;
+ p[i] = x;
+ ++i;
+ } while (x == 0);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ mp_limb_t t;
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ sign = 0;
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign ^= 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p + n2, x, y, n2);
+
+ /* Pointwise products. */
+ if (n2 < KARATSUBA_SQR_THRESHOLD)
+ {
+ mpn_sqr_basecase (ws, p, n2);
+ mpn_sqr_basecase (p, a, n2);
+ mpn_sqr_basecase (p + n, a + n2, n2);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n2, ws + n);
+ mpn_kara_sqr_n (p, a, n2, ws + n);
+ mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ if (sign)
+ w = mpn_add_n (ws, p, ws, n);
+ else
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ /* TO DO: could put "if (w) { ... }" here.
+ * Less work but badly predicted branch.
+ * No measurable difference in speed on Alpha.
+ */
+ i = n + n2;
+ t = p[i] + w;
+ p[i] = t;
+ if (t < w)
+ {
+ do
+ {
+ ++i;
+ w = p[i] + 1;
+ p[i] = w;
+ }
+ while (w == 0);
+ }
+ }
+}
+
+/*-- add2Times -------------------------------------------------------------*/
+
+/* z[] = x[] + 2 * y[]
+ Note that z and x might point to the same vectors. */
+#ifdef USE_MORE_MPN
+static inline mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_ptr t;
+ mp_limb_t c;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ c = mpn_lshift (t, y, n, 1);
+ c += mpn_add_n (z, x, t, n);
+ TMP_FREE (marker);
+ return c;
+}
+#else
+
+static mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t c, v, w;
+
+ ASSERT (n > 0);
+ v = *x; w = *y;
+ c = w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ while (--n)
+ {
+ v = *x;
+ w = *y;
+ v += c;
+ c = v < c;
+ c += w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ }
+
+ return c;
+}
+#endif
+
+/*-- evaluate3 -------------------------------------------------------------*/
+
+/* Evaluates:
+ * ph := 4*A+2*B+C
+ * p1 := A+B+C
+ * p2 := A+2*B+4*C
+ * where:
+ * ph[], p1[], p2[], A[] and B[] all have length len,
+ * C[] has length len2 with len-len2 = 0, 1 or 2.
+ * Returns top words (overflow) at pth, pt1 and pt2 respectively.
+ */
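+/* Equivalently, with g(x) = A + B*x + C*x^2, these are
+   ph = 4*g(1/2), p1 = g(1) and p2 = g(2): three of the five Toom-3
+   evaluation points 0, 1/2, 1, 2, oo used below. */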
+#ifdef USE_MORE_MPN
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, len, len2)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_limb_t c, d, e;
+
+ ASSERT (len - len2 <= 2);
+
+ e = mpn_lshift (p1, B, len, 1);
+
+ c = mpn_lshift (ph, A, len, 2);
+ c += e + mpn_add_n (ph, ph, p1, len);
+ d = mpn_add_n (ph, ph, C, len2);
+ if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
+ ASSERT (c < 7);
+ *pth = c;
+
+ c = mpn_lshift (p2, C, len2, 2);
+#if 1
+ if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
+ c += e + mpn_add_n (p2, p2, p1, len);
+#else
+ d = mpn_add_n (p2, p2, p1, len2);
+ c += d;
+ if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
+ c += e;
+#endif
+ c += mpn_add_n (p2, p2, A, len);
+ ASSERT (c < 7);
+ *pt2 = c;
+
+ c = mpn_add_n (p1, A, B, len);
+ d = mpn_add_n (p1, p1, C, len2);
+ if (len2 == len) c += d;
+ else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
+ ASSERT (c < 3);
+ *pt1 = c;
+
+}
+
+#else
+
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, l, ls)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
+
+ ASSERT (l - ls <= 2);
+
+ th = t1 = t2 = 0;
+ for (i = 0; i < l; ++i)
+ {
+ a = *A;
+ b = *B;
+ c = i < ls ? *C : 0;
+
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ t = a << 2;
+ vh = th + t;
+ th = vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ vh += c;
+ th += vh < c;
+#else
+ vh = th + c;
+ th = vh < c;
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ t = a << 2;
+ vh += t;
+ th += vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+#endif
+
+ v1 = t1 + a;
+ t1 = v1 < a;
+ v1 += b;
+ t1 += v1 < b;
+ v1 += c;
+ t1 += v1 < c;
+
+ v2 = t2 + a;
+ t2 = v2 < a;
+ t = b << 1;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += b >> (BITS_PER_MP_LIMB - 1);
+ t = c << 2;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += c >> (BITS_PER_MP_LIMB - 2);
+
+ *ph = vh;
+ *p1 = v1;
+ *p2 = v2;
+
+ ++A; ++B; ++C;
+ ++ph; ++p1; ++p2;
+ }
+
+ ASSERT (th < 7);
+ ASSERT (t1 < 3);
+ ASSERT (t2 < 7);
+
+ *pth = th;
+ *pt1 = t1;
+ *pt2 = t2;
+}
+#endif
+
+
+/*-- interpolate3 ----------------------------------------------------------*/
+
+/* Interpolates B, C, D (in-place) from:
+ * 16*A+8*B+4*C+2*D+E
+ * A+B+C+D+E
+ * A+2*B+4*C+8*D+16*E
+ * where:
+ * A[], B[], C[] and D[] all have length l,
+ * E[] has length ls with l-ls = 0, 2 or 4.
+ *
+ * Reads top words (from earlier overflow) from ptb, ptc and ptd,
+ * and returns new top words there.
+ */
+
+#ifdef USE_MORE_MPN
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, len, len2)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_ptr ws;
+ mp_limb_t t, tb,tc,td;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
+
+ tb = *ptb; tc = *ptc; td = *ptd;
+
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+
+ t = mpn_lshift (ws, A, len, 4);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+ t = mpn_sub_n (B, B, E, len2);
+ if (len2 == len) tb -= t;
+ else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
+
+ tc -= mpn_sub_n (C, C, A, len);
+ t = mpn_sub_n (C, C, E, len2);
+ if (len2 == len) tc -= t;
+ else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
+
+ t = mpn_lshift (ws, E, len2, 4);
+ t += mpn_add_n (ws, ws, A, len2);
+#if 1
+ if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
+ td -= t + mpn_sub_n (D, D, ws, len);
+#else
+ t += mpn_sub_n (D, D, ws, len2);
+ if (len2 != len) {
+ t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
+ t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
+ } /* end if/else */
+ td -= t;
+#endif
+
+
+ /* b, d := b + d, b - d */
+
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* b := b-8*c */
+ t = 8 * tc + mpn_lshift (ws, C, len, 3);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+
+ /* c := 2*c - b */
+ tc = 2 * tc + mpn_lshift (C, C, len, 1);
+ tc -= tb + mpn_sub_n (C, C, B, len);
+
+ /* d := d/3 */
+ td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
+
+ /* b, d := b + d, b - d */
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ ASSERT(!(*B & 3));
+ mpn_rshift (B, B, len, 2);
+ B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)tb >= 0);
+ tb >>= 2;
+
+ ASSERT(!(*C & 1));
+ mpn_rshift (C, C, len, 1);
+ C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
+ ASSERT((long)tc >= 0);
+ tc >>= 1;
+
+ ASSERT(!(*D & 3));
+ mpn_rshift (D, D, len, 2);
+ D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)td >= 0);
+ td >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (tb < 2);
+ if (len == len2)
+ {
+ ASSERT (tc < 3);
+ ASSERT (td < 2);
+ }
+ else
+ {
+ ASSERT (tc < 2);
+ ASSERT (!td);
+ }
+#endif
+
+ *ptb = tb;
+ *ptc = tc;
+ *ptd = td;
+
+ TMP_FREE (marker);
+}
+
+#else
+
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, l, ls)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
+ const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
+
+#if WANT_ASSERT
+ t = l - ls;
+ ASSERT (t == 0 || t == 2 || t == 4);
+#endif
+
+ sb = sc = sd = 0;
+ for (i = 0; i < l; ++i)
+ {
+ mp_limb_t tb, tc, td, tt;
+
+ a = *A;
+ b = *B;
+ c = *C;
+ d = *D;
+ e = i < ls ? *E : 0;
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+ t = a << 4;
+ tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
+ b -= t;
+ tb -= b < e;
+ b -= e;
+ tc = -(c < a);
+ c -= a;
+ tc -= c < e;
+ c -= e;
+ td = -(d < a);
+ d -= a;
+ t = e << 4;
+ td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
+ d -= t;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* b := b-8*c */
+ t = c << 3;
+ tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
+ b -= t;
+
+ /* c := 2*c - b */
+ t = c << 1;
+ tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
+ c = t - b;
+
+ /* d := d/3 */
+ d *= INVERSE_3;
+ td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
+ td *= INVERSE_3;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ /* sb has period 2. */
+ b += sb;
+ tb += b < sb;
+ sb &= maskOffHalf;
+ sb |= sb >> (BITS_PER_MP_LIMB >> 1);
+ sb += tb;
+
+ /* sc has period 1. */
+ c += sc;
+ tc += c < sc;
+ /* TO DO: choose one of the following alternatives. */
+#if 1
+ sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
+ sc += tc;
+#else
+ sc = tc - ((long)sc < 0L);
+#endif
+
+ /* sd has period 2. */
+ d += sd;
+ td += d < sd;
+ sd &= maskOffHalf;
+ sd |= sd >> (BITS_PER_MP_LIMB >> 1);
+ sd += td;
+
+ if (i != 0)
+ {
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+ }
+ ob = b >> 2;
+ oc = c >> 1;
+ od = d >> 2;
+
+ ++A; ++B; ++C; ++D; ++E;
+ }
+
+ /* Handle top words. */
+ b = *ptb;
+ c = *ptc;
+ d = *ptd;
+
+ t = b + d;
+ d = b - d;
+ b = t;
+ b -= c << 3;
+ c = (c << 1) - b;
+ d *= INVERSE_3;
+ t = b + d;
+ d = b - d;
+ b = t;
+
+ b += sb;
+ c += sc;
+ d += sd;
+
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+
+ b >>= 2;
+ c >>= 1;
+ d >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (b < 2);
+ if (l == ls)
+ {
+ ASSERT (c < 3);
+ ASSERT (d < 2);
+ }
+ else
+ {
+ ASSERT (c < 2);
+ ASSERT (!d);
+ }
+#endif
+
+ *ptb = b;
+ *ptc = c;
+ *ptd = d;
+}
+#endif
+
+
+/*-- mpn_toom3_mul_n --------------------------------------------------------------*/
+
+/* Multiplies using 5 mults of one third size and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
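+/* Sketch of the idea (plain integers, not GMP code): write
+   a = a0 + a1*W + a2*W^2 and b likewise; the product polynomial
+   c(x) = a(x)*b(x) has degree 4, so its values at the five points
+   0, 1/2, 1, 2, oo determine it, giving five third-size
+   multiplications in place of nine. */
+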
+/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
+ * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
+ * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
+ */
+
+#define TOOM3_MUL_REC(p, a, b, n, ws) \
+ do { \
+ if (n < KARATSUBA_MUL_THRESHOLD) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else if (n < TOOM3_MUL_THRESHOLD) \
+ mpn_kara_mul_n (p, a, b, n, ws); \
+ else \
+ mpn_toom3_mul_n (p, a, b, n, ws); \
+ } while (0)
+
+void
+#if __STDC__
+mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_mul_n (p, a, b, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
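+  /* For example, n = 10 = 3*3+1 gives l = 4, ls = 2 (4+4+2 = 10),
+     and n = 11 = 3*3+2 gives l = 4, ls = 3 (4+4+3 = 11). */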
+ {
+ mp_limb_t m;
+
+ ASSERT (n >= TOOM3_MUL_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+ evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
+
+ /** Second stage: pointwise multiplies. **/
+ TOOM3_MUL_REC(D, C, C + l, l, W);
+ tD = cD*dD;
+ if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
+ if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
+ ASSERT (tD < 49);
+ TOOM3_MUL_REC(C, B, B + l, l, W);
+ tC = cC*dC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
+ if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
+#else
+ if (cC)
+ {
+ if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
+ else tC += add2Times (C + l, C + l, B + l, l);
+ }
+ if (dC)
+ {
+ if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
+ else tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_MUL_REC(B, A, A + l, l, W);
+ tB = cB*dB;
+ if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
+ if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
+ ASSERT (tB < 49);
+ TOOM3_MUL_REC(A, a, b, l, W);
+ TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients. **/
+ {
+ mp_limb_t i, x, y;
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
+
+/* Like previous function but for squaring */
+
+#define TOOM3_SQR_REC(p, a, n, ws) \
+ do { \
+ if (n < KARATSUBA_SQR_THRESHOLD) \
+ mpn_sqr_basecase (p, a, n); \
+ else if (n < TOOM3_SQR_THRESHOLD) \
+ mpn_kara_sqr_n (p, a, n, ws); \
+ else \
+ mpn_toom3_sqr_n (p, a, n, ws); \
+ } while (0)
+
+void
+#if __STDC__
+mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_sqr_n (p, a, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t cB,cC,cD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
+ {
+ mp_limb_t m;
+
+ ASSERT (n >= TOOM3_MUL_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+
+ /** Second stage: pointwise multiplies. **/
+ TOOM3_SQR_REC(D, C, l, W);
+ tD = cD*cD;
+ if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
+ ASSERT (tD < 49);
+ TOOM3_SQR_REC(C, B, l, W);
+ tC = cC*cC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
+#else
+ if (cC >= 1)
+ {
+ tC += add2Times (C + l, C + l, B, l);
+ if (cC == 2)
+ tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_SQR_REC(B, A, l, W);
+ tB = cB*cB;
+ if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
+ ASSERT (tB < 49);
+ TOOM3_SQR_REC(A, a, l, W);
+ TOOM3_SQR_REC(E, a + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients. **/
+ {
+ mp_limb_t i, x, y;
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+void
+#if __STDC__
+mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
+#else
+mpn_mul_n (p, a, b, n)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+#endif
+{
+ if (n < KARATSUBA_MUL_THRESHOLD)
+ mpn_mul_basecase (p, a, n, b, n);
+ else if (n < TOOM3_MUL_THRESHOLD)
+ {
+ /* Allocate workspace of fixed size on stack: fast! */
+#if TUNE_PROGRAM_BUILD
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
+#else
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
+#endif
+ mpn_kara_mul_n (p, a, b, n, ws);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (n < FFT_MUL_THRESHOLD)
+#else
+ else
+#endif
+ {
+ /* Use workspace of unknown size in heap, as stack space may
+ * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
+ * multiplication will take much longer than malloc()/free(). */
+ mp_limb_t wsLen, *ws;
+ wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
+ ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
+ mpn_toom3_mul_n (p, a, b, n, ws);
+ (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else
+ {
+ mpn_mul_fft_full (p, a, n, b, n);
+ }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/perfsqr.c b/rts/gmp/mpn/generic/perfsqr.c
new file mode 100644
index 0000000000..42ee3405d7
--- /dev/null
+++ b/rts/gmp/mpn/generic/perfsqr.c
@@ -0,0 +1,123 @@
+/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
+ zero otherwise.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue
+ modulo 0x100. */
+static unsigned char const sq_res_0x100[0x100] =
+{
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+};
+
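+/* The table can be regenerated with a sketch like this (illustrative
+   only, not built): every square mod 0x100 depends only on its root
+   mod 0x100, so marking (i*i) & 0xFF for i = 0..0xFF covers them all. */
+#if 0
+static void
+gen_sq_res_0x100 (unsigned char t[0x100])
+{
+  unsigned int i;
+  for (i = 0; i < 0x100; i++)
+    t[i] = 0;
+  for (i = 0; i < 0x100; i++)
+    t[(i * i) & 0xFF] = 1;      /* mark each quadratic residue mod 256 */
+}
+#endif
+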
+int
+#if __STDC__
+mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
+#else
+mpn_perfect_square_p (up, usize)
+ mp_srcptr up;
+ mp_size_t usize;
+#endif
+{
+ mp_limb_t rem;
+ mp_ptr root_ptr;
+ int res;
+ TMP_DECL (marker);
+
+ /* The first test excludes 55/64 (85.9%) of the perfect square candidates
+ in O(1) time. */
+ if ((sq_res_0x100[(unsigned int) up[0] % 0x100] & 1) == 0)
+ return 0;
+
+#if defined (PP)
+ /* The second test excludes 30652543/30808063 (99.5%) of the remaining
+ perfect square candidates in O(n) time. */
+
+ /* Firstly, compute REM = A mod PP. */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6))
+ rem = mpn_preinv_mod_1 (up, usize, (mp_limb_t) PP, (mp_limb_t) PP_INVERTED);
+ else
+ rem = mpn_mod_1 (up, usize, (mp_limb_t) PP);
+
+ /* Now decide if REM is a quadratic residue modulo the factors in PP. */
+
+  /* If A is just a few limbs, computing the square root does not take a
+     long time, so things might run faster if we limit this loop according
+     to the size of A. */
+
+#if BITS_PER_MP_LIMB == 64
+ if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0)
+ return 0;
+#endif
+ if (((0x13D122F3L >> rem % 29) & 1) == 0)
+ return 0;
+ if (((0x5335FL >> rem % 23) & 1) == 0)
+ return 0;
+ if (((0x30AF3L >> rem % 19) & 1) == 0)
+ return 0;
+ if (((0x1A317L >> rem % 17) & 1) == 0)
+ return 0;
+ if (((0x161BL >> rem % 13) & 1) == 0)
+ return 0;
+ if (((0x23BL >> rem % 11) & 1) == 0)
+ return 0;
+ if (((0x017L >> rem % 7) & 1) == 0)
+ return 0;
+ if (((0x13L >> rem % 5) & 1) == 0)
+ return 0;
+ if (((0x3L >> rem % 3) & 1) == 0)
+ return 0;
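+
+  /* Each mask above has bit r set iff r is a quadratic residue modulo
+     the corresponding small prime: e.g. the residues mod 5 are
+     {0, 1, 4}, giving the mask 0b10011 = 0x13. */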
+#endif
+
+ TMP_MARK (marker);
+
+ /* For the third and last test, we finally compute the square root,
+ to make sure we've really got a perfect square. */
+ root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB);
+
+  /* Iff mpn_sqrtrem returns zero (no remainder), U is a perfect square. */
+ res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
+ TMP_FREE (marker);
+ return res;
+}
diff --git a/rts/gmp/mpn/generic/popcount.c b/rts/gmp/mpn/generic/popcount.c
new file mode 100644
index 0000000000..387be9536d
--- /dev/null
+++ b/rts/gmp/mpn/generic/popcount.c
@@ -0,0 +1,93 @@
+/* popcount.c
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seems to
+   implement the POPC instruction.  Disable pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Cool population count of a mp_limb_t.
+ You have to figure out how this works, I won't tell you! */
+
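+/* (A hint all the same: it is the classic divide-and-conquer count,
+   folding 1-bit counts into 2-, 4- and 8-bit field sums, then adding
+   the byte sums together with shifted adds.) */
+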
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+ mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+  /* We have to go to some trouble to define these constants.
+     (For mp_limb_t being `long long'.) */
+ mp_limb_t cnst;
+ cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+ x -= (x & cnst) >> 1;
+ cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+ x = ((x & ~cnst) >> 2) + (x & cnst);
+ cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+ x = ((x >> 4) + x) & cnst;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x);
+ x = ((x >> 32) + x) & 0xff;
+#endif
+#if BITS_PER_MP_LIMB == 32
+ x -= (x & 0xaaaaaaaa) >> 1;
+ x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);
+ x = ((x >> 4) + x) & 0x0f0f0f0fL;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x) & 0xff;
+#endif
+ return x;
+}
+#endif
+
+unsigned long int
+#if __STDC__
+mpn_popcount (register mp_srcptr p, register mp_size_t size)
+#else
+mpn_popcount (p, size)
+ register mp_srcptr p;
+ register mp_size_t size;
+#endif
+{
+ unsigned long int popcnt;
+ mp_size_t i;
+
+ popcnt = 0;
+ for (i = 0; i < size; i++)
+ popcnt += popc_limb (p[i]);
+
+ return popcnt;
+}
diff --git a/rts/gmp/mpn/generic/pre_mod_1.c b/rts/gmp/mpn/generic/pre_mod_1.c
new file mode 100644
index 0000000000..27179683b3
--- /dev/null
+++ b/rts/gmp/mpn/generic/pre_mod_1.c
@@ -0,0 +1,69 @@
+/* mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb,
+ divisor_limb_inverted) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by the normalized DIVISOR_LIMB.
+ DIVISOR_LIMB_INVERTED should be 2^(2*BITS_PER_MP_LIMB) / DIVISOR_LIMB +
+ - 2^BITS_PER_MP_LIMB.
+ Return the single-limb remainder.
+
+Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
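+/* A hedged sketch of how a caller prepares the preinverted divisor,
+   using the invert_limb macro from gmp-impl.h (illustrative only,
+   not built): */
+#if 0
+static mp_limb_t
+preinv_mod_1_demo (mp_srcptr up, mp_size_t un, mp_limb_t d)
+{
+  mp_limb_t dinv;
+  /* d must be normalized, i.e. have its most significant bit set */
+  invert_limb (dinv, d);  /* dinv = floor(2^(2*L)/d) - 2^L, L limb bits */
+  return mpn_preinv_mod_1 (up, un, d, dinv);
+}
+#endif
+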
+mp_limb_t
+#if __STDC__
+mpn_preinv_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb, mp_limb_t divisor_limb_inverted)
+#else
+mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, divisor_limb_inverted)
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+ mp_limb_t divisor_limb_inverted;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n0, r;
+ int dummy;
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r, n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+}
diff --git a/rts/gmp/mpn/generic/random.c b/rts/gmp/mpn/generic/random.c
new file mode 100644
index 0000000000..dea4e20e56
--- /dev/null
+++ b/rts/gmp/mpn/generic/random.c
@@ -0,0 +1,43 @@
+/* mpn_random -- Generate random numbers.
+
+Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "urandom.h"
+
+void
+#if __STDC__
+mpn_random (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+
+ for (i = 0; i < size; i++)
+ res_ptr[i] = urandom ();
+
+ /* Make sure the most significant limb is non-zero. */
+ while (res_ptr[size - 1] == 0)
+ res_ptr[size - 1] = urandom ();
+}
diff --git a/rts/gmp/mpn/generic/random2.c b/rts/gmp/mpn/generic/random2.c
new file mode 100644
index 0000000000..86682f81fa
--- /dev/null
+++ b/rts/gmp/mpn/generic/random2.c
@@ -0,0 +1,105 @@
+/* mpn_random2 -- Generate random numbers with relatively long strings
+ of ones and zeroes. Suitable for border testing.
+
+Copyright (C) 1992, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined (__hpux) || defined (__alpha) || defined (__svr4__) || defined (__SVR4)
+/* HPUX lacks random(). DEC OSF/1 1.2 random() returns a double. */
+long mrand48 ();
+static inline long
+random ()
+{
+ return mrand48 ();
+}
+#elif defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff */
+static inline unsigned long
+random ()
+{
+  /* rand() gives only 15 bits, and a shift by 32 would be undefined on
+     32-bit longs, so combine three calls at 15-bit offsets instead. */
+  return rand () ^ ((unsigned long) rand () << 15)
+         ^ ((unsigned long) rand () << 30);
+}
+
+#else
+long random ();
+#endif
+
+/* It's a bit tricky to get this right, so please test the code well
+ if you hack with it. Some early versions of the function produced
+ random numbers with the leading limb == 0, and some versions never
+ made the most significant bit set. */
+
+void
+#if __STDC__
+mpn_random2 (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random2 (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ int n_bits;
+ int bit_pos;
+ mp_size_t limb_pos;
+ unsigned int ran;
+ mp_limb_t limb;
+
+ limb = 0;
+
+ /* Start off in a random bit position in the most significant limb. */
+ bit_pos = random () & (BITS_PER_MP_LIMB - 1);
+
+ /* Least significant bit of RAN chooses string of ones/string of zeroes.
+ Make most significant limb be non-zero by setting bit 0 of RAN. */
+ ran = random () | 1;
+
+ for (limb_pos = size - 1; limb_pos >= 0; )
+ {
+ n_bits = (ran >> 1) % BITS_PER_MP_LIMB + 1;
+ if ((ran & 1) != 0)
+ {
+ /* Generate a string of ones. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb | ((((mp_limb_t) 2) << bit_pos) - 1);
+ bit_pos += BITS_PER_MP_LIMB;
+ limb = (~(mp_limb_t) 0) << (bit_pos - n_bits);
+ }
+ else
+ {
+ limb |= ((((mp_limb_t) 1) << n_bits) - 1) << (bit_pos - n_bits + 1);
+ }
+ }
+ else
+ {
+ /* Generate a string of zeroes. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb;
+ limb = 0;
+ bit_pos += BITS_PER_MP_LIMB;
+ }
+ }
+ bit_pos -= n_bits;
+ ran = random ();
+ }
+}
diff --git a/rts/gmp/mpn/generic/rshift.c b/rts/gmp/mpn/generic/rshift.c
new file mode 100644
index 0000000000..59caf73529
--- /dev/null
+++ b/rts/gmp/mpn/generic/rshift.c
@@ -0,0 +1,88 @@
+/* mpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
+ and store the USIZE least significant limbs of the result at WP.
+ The bits shifted out to the right are returned.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be <= UP.
+*/
+
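+/* Worked example (a sketch, assuming 32-bit limbs): shifting the
+   two-limb operand {0x00000001, 0x80000000} right by 4 stores
+   {0x00000000, 0x08000000} and returns 0x10000000, the shifted-out
+   bits left-justified in a limb. */
+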
+mp_limb_t
+#if __STDC__
+mpn_rshift (register mp_ptr wp,
+ register mp_srcptr up, mp_size_t usize,
+ register unsigned int cnt)
+#else
+mpn_rshift (wp, up, usize, cnt)
+ register mp_ptr wp;
+ register mp_srcptr up;
+ mp_size_t usize;
+ register unsigned int cnt;
+#endif
+{
+ register mp_limb_t high_limb, low_limb;
+ register unsigned sh_1, sh_2;
+ register mp_size_t i;
+ mp_limb_t retval;
+
+#ifdef DEBUG
+ if (usize == 0 || cnt == 0)
+ abort ();
+#endif
+
+ sh_1 = cnt;
+
+#if 0
+ if (sh_1 == 0)
+ {
+ if (wp != up)
+ {
+ /* Copy from low end to high end, to allow specified input/output
+ overlapping. */
+ for (i = 0; i < usize; i++)
+ wp[i] = up[i];
+ }
+ return usize;
+ }
+#endif
+
+ wp -= 1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
+ high_limb = up[0];
+ retval = high_limb << sh_2;
+ low_limb = high_limb;
+
+ for (i = 1; i < usize; i++)
+ {
+ high_limb = up[i];
+ wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
+ low_limb = high_limb;
+ }
+ wp[i] = low_limb >> sh_1;
+
+ return retval;
+}
diff --git a/rts/gmp/mpn/generic/sb_divrem_mn.c b/rts/gmp/mpn/generic/sb_divrem_mn.c
new file mode 100644
index 0000000000..a269e34f5f
--- /dev/null
+++ b/rts/gmp/mpn/generic/sb_divrem_mn.c
@@ -0,0 +1,201 @@
+/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and
+ quotient.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
+ the NSIZE-DSIZE least significant quotient limbs at QP
+ and the DSIZE long remainder at NP. If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+   Return the most significant limb of the quotient; this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= DSIZE.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + DSIZE >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+ remainder in NUM.
+ 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero.
+ 4. DSIZE >= 2. */
+
+
+#define PREINVERT_VIABLE \
+ (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */)
+
+mp_limb_t
+#if __STDC__
+mpn_sb_divrem_mn (mp_ptr qp,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp, mp_size_t dsize)
+#else
+mpn_sb_divrem_mn (qp, np, nsize, dp, dsize)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+ mp_size_t dsize;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t dx, d1, n0;
+ mp_limb_t dxinv;
+ int have_preinv;
+
+ ASSERT_ALWAYS (dsize > 2);
+
+ np += nsize - dsize;
+ dx = dp[dsize - 1];
+ d1 = dp[dsize - 2];
+ n0 = np[dsize - 1];
+
+ if (n0 >= dx)
+ {
+ if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0)
+ {
+ mpn_sub_n (np, np, dp, dsize);
+ most_significant_q_limb = 1;
+ }
+ }
+
+ /* If multiplication is much faster than division, preinvert the
+ most significant divisor limb before entering the loop. */
+ if (PREINVERT_VIABLE)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME)
+ {
+ invert_limb (dxinv, dx);
+ have_preinv = 1;
+ }
+ }
+
+ for (i = nsize - dsize - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t nx;
+ mp_limb_t cy_limb;
+
+ nx = np[dsize - 1];
+ np--;
+
+ if (nx == dx)
+ {
+ /* This might over-estimate q, but it's probably not worth
+ the extra code here to find out. */
+ q = ~(mp_limb_t) 0;
+
+#if 1
+ cy_limb = mpn_submul_1 (np, dp, dsize, q);
+#else
+ /* This should be faster on many machines */
+ cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize);
+ cy = mpn_add_n (np, np, dp, dsize);
+ np[dsize] += cy;
+#endif
+
+ if (nx != cy_limb)
+ {
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+
+ qp[i] = q;
+ }
+ else
+ {
+ mp_limb_t rx, r1, r0, p1, p0;
+
+ /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
+ usage when np[dsize-1] is used in an asm statement like
+ umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due
+ to registers being clobbered. gcc 2.95 i386 doesn't have the
+ problem. */
+ {
+ mp_limb_t workaround = np[dsize - 1];
+ if (PREINVERT_VIABLE && have_preinv)
+ udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
+ else
+ udiv_qrnnd (q, r1, nx, workaround, dx);
+ }
+ umul_ppmm (p1, p0, d1, q);
+
+ r0 = np[dsize - 2];
+ rx = 0;
+ if (r1 < p1 || (r1 == p1 && r0 < p0))
+ {
+ p1 -= p0 < d1;
+ p0 -= d1;
+ q--;
+ r1 += dx;
+ rx = r1 < dx;
+ }
+
+ p1 += r0 < p0; /* cannot carry! */
+ rx -= r1 < p1; /* may become 11..1 if q is still too large */
+ r1 -= p1;
+ r0 -= p0;
+
+ cy_limb = mpn_submul_1 (np, dp, dsize - 2, q);
+
+ {
+ mp_limb_t cy1, cy2;
+ cy1 = r0 < cy_limb;
+ r0 -= cy_limb;
+ cy2 = r1 < cy1;
+ r1 -= cy1;
+ np[dsize - 1] = r1;
+ np[dsize - 2] = r0;
+ if (cy2 != rx)
+ {
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+ }
+ qp[i] = q;
+ }
+ }
+
+ /* ______ ______ ______
+ |__rx__|__r1__|__r0__| partial remainder
+ ______ ______
+ - |__p1__|__p0__| partial product to subtract
+ ______ ______
+ - |______|cylimb|
+
+ rx is -1, 0 or 1. If rx=1, then q is correct (it should match
+ carry out). If rx=-1 then q is too large. If rx=0, then q might
+ be too large, but it is most likely correct.
+ */
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/scan0.c b/rts/gmp/mpn/generic/scan0.c
new file mode 100644
index 0000000000..96f05ce854
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan0.c
@@ -0,0 +1,62 @@
+/* mpn_scan0 -- Scan from a given bit position for the next clear bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+   1. U must sooner or later have a limb with a clear bit.
+ */
+
+unsigned long int
+#if __STDC__
+mpn_scan0 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan0 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ alimb = ~*p++;
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = ~*p++;
+
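+  /* alimb & -alimb isolates the least significant set bit (a clear bit
+     in the original limb); its leading-zero count then gives that
+     bit's position from the top of the limb. */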
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
diff --git a/rts/gmp/mpn/generic/scan1.c b/rts/gmp/mpn/generic/scan1.c
new file mode 100644
index 0000000000..98e2e0dcc0
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan1.c
@@ -0,0 +1,62 @@
+/* mpn_scan1 -- Scan from a given bit position for the next set bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+   1. U must sooner or later have a limb != 0.
+ */
+
+unsigned long int
+#if __STDC__
+mpn_scan1 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan1 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ alimb = *p++;
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = *p++;
+
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
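+
+/* Worked example (editorial note): with 32-bit limbs, up[0] = 0x00000010
+   and starting_bit = 0, the masked first limb is alimb = 0x10, so
+   alimb & -alimb = 0x10, count_leading_zeros gives 27, and the return
+   value is 1*32 - 1 - 27 = 4, the position of the first set bit.  */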
diff --git a/rts/gmp/mpn/generic/set_str.c b/rts/gmp/mpn/generic/set_str.c
new file mode 100644
index 0000000000..e6ccc92154
--- /dev/null
+++ b/rts/gmp/mpn/generic/set_str.c
@@ -0,0 +1,159 @@
+/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base)
+ -- Convert a STR_LEN long base BASE byte string pointed to by STR to a
+ limb vector pointed to by RES_PTR. Return the number of limbs in
+ RES_PTR.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_size_t
+#if __STDC__
+mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base)
+#else
+mpn_set_str (xp, str, str_len, base)
+ mp_ptr xp;
+ const unsigned char *str;
+ size_t str_len;
+ int base;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t big_base;
+ int indigits_per_limb;
+ mp_limb_t res_digit;
+
+ big_base = __mp_bases[base].big_base;
+ indigits_per_limb = __mp_bases[base].chars_per_limb;
+
+/* size = str_len / indigits_per_limb + 1; */
+
+ size = 0;
+
+ if ((base & (base - 1)) == 0)
+ {
+ /* The base is a power of 2. Read the input string from
+ least to most significant character/digit. */
+
+ const unsigned char *s;
+ int next_bitpos;
+ int bits_per_indigit = big_base;
+
+ res_digit = 0;
+ next_bitpos = 0;
+
+ for (s = str + str_len - 1; s >= str; s--)
+ {
+ int inp_digit = *s;
+
+ res_digit |= (mp_limb_t) inp_digit << next_bitpos;
+ next_bitpos += bits_per_indigit;
+ if (next_bitpos >= BITS_PER_MP_LIMB)
+ {
+ xp[size++] = res_digit;
+ next_bitpos -= BITS_PER_MP_LIMB;
+ res_digit = inp_digit >> (bits_per_indigit - next_bitpos);
+ }
+ }
+
+ if (res_digit != 0)
+ xp[size++] = res_digit;
+ }
+ else
+ {
+ /* General case. The base is not a power of 2. */
+
+ size_t i;
+ int j;
+ mp_limb_t cy_limb;
+
+ for (i = indigits_per_limb; i < str_len; i += indigits_per_limb)
+ {
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * 10 + *str++;
+ }
+ else
+ {
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * base + *str++;
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ big_base = base;
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * 10 + *str++;
+ big_base *= 10;
+ }
+ }
+ else
+ {
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * base + *str++;
+ big_base *= base;
+ }
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ return size;
+}
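+
+/* Worked example (editorial note): the digits in STR are byte values,
+   not ASCII, so mpn_set_str (xp, "\001\002\003", 3, 10) runs only the
+   general-case tail, accumulating
+     res_digit = (1*10 + 2)*10 + 3 = 123
+   with big_base growing to 1000, and stores xp[0] = 123, returning 1.  */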
diff --git a/rts/gmp/mpn/generic/sqr_basecase.c b/rts/gmp/mpn/generic/sqr_basecase.c
new file mode 100644
index 0000000000..760258a3e0
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqr_basecase.c
@@ -0,0 +1,83 @@
+/* mpn_sqr_basecase -- Internal routine to square a natural number
+   of length n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n)
+#else
+mpn_sqr_basecase (prodp, up, n)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ {
+ /* N.B.! We need the superfluous indirection through argh to work around
+ a reloader bug in GCC 2.7.*. */
+ mp_limb_t x;
+ mp_limb_t argh;
+ x = up[0];
+ umul_ppmm (argh, prodp[0], x, x);
+ prodp[1] = argh;
+ }
+ if (n > 1)
+ {
+ mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD];
+ mp_ptr tp = tarr;
+ mp_limb_t cy;
+
+ /* must fit 2*n limbs in tarr */
+ ASSERT (n <= KARATSUBA_SQR_THRESHOLD);
+
+ cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+ tp[n - 1] = cy;
+ for (i = 2; i < n; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
+ tp[n + i - 2] = cy;
+ }
+ for (i = 1; i < n; i++)
+ {
+ mp_limb_t x;
+ x = up[i];
+ umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x);
+ }
+ {
+ mp_limb_t cy;
+ cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+ cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2);
+ prodp[2 * n - 1] += cy;
+ }
+ }
+}
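+
+/* Editorial note on the scheme above: writing U = sum u_i * B^i,
+
+     U^2 = sum u_i^2 * B^(2*i)  +  2 * sum_{i<j} u_i * u_j * B^(i+j).
+
+   The diagonal squares u_i^2 are written directly into prodp with
+   umul_ppmm; the cross products are accumulated in tp and doubled with
+   the single mpn_lshift before being added in.  */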
diff --git a/rts/gmp/mpn/generic/sqrtrem.c b/rts/gmp/mpn/generic/sqrtrem.c
new file mode 100644
index 0000000000..ee3b5144dd
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqrtrem.c
@@ -0,0 +1,509 @@
+/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+
+ Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
+ Write the remainder at REM_PTR, if REM_PTR != NULL.
+ Return the size of the remainder.
+ (The size of the root is always half of the size of the operand.)
+
+ OP_PTR and ROOT_PTR may not point to the same object.
+ OP_PTR and REM_PTR may point to the same object.
+
+ If REM_PTR is NULL, only the root is computed and the return value of
+ the function is 0 if OP is a perfect square, and *any* non-zero number
+ otherwise.
+
+Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* This code is only correct if "unsigned char" has at least 8 bits. It
+ doesn't help to use CHAR_BIT from limits.h, as the real problem is
+ the static arrays. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Square root algorithm:
+
+ 1. Shift OP (the input) to the left an even number of bits such that
+ there is an even number of words and either (or both) of the two
+ most significant bits is set. This way, sqrt(OP) has exactly half as
+ many words as OP, and has its most significant bit set.
+
+ 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
+ This approximation is used for the first single-precision
+ iterations of Newton's method, yielding a full-word approximation
+ to sqrt(OP).
+
+ 3. Perform multiple-precision Newton iteration until we have the
+ exact result. Only about half of the input operand is used in
+ this calculation, as the square root is perfectly determinable
+ from just the higher half of a number. */
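+
+/* Editorial sketch of step 3's recurrence in scalar form -- the usual
+   Newton iteration for x^2 - a = 0, not code from this file:
+
+     r_next = (r + a/r) / 2;
+
+   each step roughly doubles the number of correct bits.  The
+   multiple-precision loop below computes the quotient with mpn_divmod,
+   adds it to R with mpn_add_n, and halves with mpn_rshift.  */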
+
+/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */
+#if defined __GNUC__ && ! defined __SOFT_FLOAT
+
+#if defined (__sparc__) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__HAVE_68881__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__hppa) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if 0
+#if defined (__i386__) || defined (__i486__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \
+ __sqrt_res; \
+ })
+#endif
+#endif
+
+#endif
+
+#ifndef SQRT
+
+/* Tables for initial approximation of the square root. These are
+ indexed with bits 1-8 of the operand for which the square root is
+ calculated, where bit 0 is the most significant non-zero bit. I.e.
+ the most significant one-bit is not used, since that by definition
+ is one. Likewise, the tables don't return the highest bit of the
+ result. That bit must be inserted by ORing the returned value with
+ 0x100. This way, we get a 9-bit approximation from 8-bit tables! */
+
+/* Table to be used for operands with an even total number of bits.
+ (Just as in the decimal system, numbers with the same initial digits
+ and an even difference in the total number of digits have similar
+ square roots. Compare the square roots of 1, 100, 10000, ...) */
+static const unsigned char even_approx_tab[256] =
+{
+ 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
+ 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
+ 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
+ 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
+ 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
+ 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
+ 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
+ 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
+ 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
+ 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
+ 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
+ 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
+ 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
+ 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
+ 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
+ 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
+ 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
+ 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
+ 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
+ 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
+ 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
+ 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
+ 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
+ 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
+ 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
+ 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
+ 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
+ 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
+ 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
+ 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
+ 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
+};
+
+/* Table to be used for operands with an odd total number of bits.
+ (Further comments before previous table.) */
+static const unsigned char odd_approx_tab[256] =
+{
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
+ 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
+ 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
+ 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
+ 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
+ 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
+ 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
+ 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
+ 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
+ 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
+ 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
+ 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
+ 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
+ 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
+ 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
+ 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
+ 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
+ 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
+ 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
+ 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
+ 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
+ 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
+ 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
+ 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
+ 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
+ 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
+ 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
+ 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
+ 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
+ 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
+ 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
+};
+#endif
+
+
+mp_size_t
+#if __STDC__
+mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
+#else
+mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+ mp_ptr root_ptr;
+ mp_ptr rem_ptr;
+ mp_srcptr op_ptr;
+ mp_size_t op_size;
+#endif
+{
+ /* R (root result) */
+ mp_ptr rp; /* Pointer to least significant word */
+ mp_size_t rsize; /* The size in words */
+
+ /* T (OP shifted to the left a.k.a. normalized) */
+ mp_ptr tp; /* Pointer to least significant word */
+ mp_size_t tsize; /* The size in words */
+ mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */
+ mp_limb_t t_high0, t_high1; /* The two most significant words */
+
+ /* TT (temporary for numerator/remainder) */
+ mp_ptr ttp; /* Pointer to least significant word */
+
+ /* X (temporary for quotient in main loop) */
+ mp_ptr xp; /* Pointer to least significant word */
+ mp_size_t xsize; /* The size in words */
+
+ unsigned cnt;
+ mp_limb_t initial_approx; /* Initially made approximation */
+ mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */
+ mp_size_t tmp;
+ mp_size_t i;
+
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+
+ /* If OP is zero, both results are zero. */
+ if (op_size == 0)
+ return 0;
+
+ count_leading_zeros (cnt, op_ptr[op_size - 1]);
+ tsize = op_size;
+ if ((tsize & 1) != 0)
+ {
+ cnt += BITS_PER_MP_LIMB;
+ tsize++;
+ }
+
+ rsize = tsize / 2;
+ rp = root_ptr;
+
+ TMP_MARK (marker);
+
+ /* Shift OP an even number of bits into T, such that either the most or
+ the second most significant bit is set, and such that the number of
+ words in T becomes even. This way, the number of words in R=sqrt(OP)
+ is exactly half as many as in OP, and the most significant bit of R
+ is set.
+
+ Also, the initial approximation is simplified by this up-shifted OP.
+
+ Finally, the Newtonian iteration which is the main part of this
+ program performs division by R. The fast division routine expects
+ the divisor to be "normalized" in exactly the sense of having the
+ most significant bit set. */
+
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
+ t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
+ (cnt & ~1) % BITS_PER_MP_LIMB);
+ else
+ MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
+
+ if (cnt >= BITS_PER_MP_LIMB)
+ tp[0] = 0;
+
+ t_high0 = tp[tsize - 1];
+ t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */
+
+/* Is there a fast sqrt instruction defined for this machine? */
+#ifdef SQRT
+ {
+ initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
+ /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
+ become incorrect due to overflow in the conversion from double to
+ mp_limb_t above. It will typically be zero in that case, but might be
+ a small number on some machines. The most significant bit of
+ INITIAL_APPROX should be set, so that bit is a good overflow
+ indication. */
+ if ((mp_limb_signed_t) initial_approx >= 0)
+ initial_approx = ~(mp_limb_t)0;
+ }
+#else
+ /* Get a 9 bit approximation from the tables. The tables expect to
+ be indexed with the 8 high bits right below the highest bit.
+ Also, the highest result bit is not returned by the tables, and
+ must be or:ed into the result. The scheme gives 9 bits of start
+ approximation with just 256-entry 8 bit tables. */
+
+ if ((cnt & 1) == 0)
+ {
+ /* The most significant bit of t_high0 is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
+ initial_approx &= 0xff;
+ initial_approx = even_approx_tab[initial_approx];
+ }
+ else
+ {
+ /* The most significant bit of t_high0 is unset,
+ the second most significant is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
+ initial_approx &= 0xff;
+ initial_approx = odd_approx_tab[initial_approx];
+ }
+ initial_approx |= 0x100;
+ initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
+
+ /* Perform small precision Newtonian iterations to get a full word
+ approximation. For small operands, these iterations will do the
+ entire job. */
+ if (t_high0 == ~(mp_limb_t)0)
+ initial_approx = t_high0;
+ else
+ {
+ mp_limb_t quot;
+
+ if (t_high0 >= initial_approx)
+ initial_approx = t_high0 + 1;
+
+ /* First get about 18 bits with pure C arithmetics. */
+ quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+
+ /* Now get a full word by one (or, for machines with limbs wider
+ than 36 bits, several) iterations. */
+ for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
+ {
+ mp_limb_t ignored_remainder;
+
+ udiv_qrnnd (quot, ignored_remainder,
+ t_high0, t_high1, initial_approx);
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+#endif
+
+ rp[0] = initial_approx;
+ rsize = 1;
+
+#ifdef SQRT_DEBUG
+ printf ("\n\nT = ");
+ mpn_dump (tp, tsize);
+#endif
+
+ if (tsize > 2)
+ {
+ /* Determine the successive precisions to use in the iteration. We
+ minimize the precisions, beginning with the highest (i.e. last
+ iteration) to the lowest (i.e. first iteration). */
+
+ xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ t_end_ptr = tp + tsize;
+
+ tmp = tsize / 2;
+ for (i = 0;; i++)
+ {
+ tsize = (tmp + 1) / 2;
+ if (tmp == tsize)
+ break;
+ tsizes[i] = tsize + tmp;
+ tmp = tsize;
+ }
+
+ /* Main Newton iteration loop. For big arguments, most of the
+ time is spent here. */
+
+ /* It is possible to do a great optimization here. The successive
+ divisors in the mpn_divmod call below have more and more leading
+ words equal to its predecessor. Therefore the beginning of
+ each division will repeat the same work as did the last
+ division. If we could guarantee that the leading words of two
+ consecutive divisors are the same (i.e. in this case, a later
+ divisor has just more digits at the end) it would be a simple
+ matter of just using the old remainder of the last division in
+ a subsequent division, to take care of this optimization. This
+ idea would surely make a difference even for small arguments. */
+
+ /* Loop invariants:
+
+ R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
+ X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
+ R <= shiftdown_to_same_size(X). */
+
+ while (--i >= 0)
+ {
+ mp_limb_t cy;
+#ifdef SQRT_DEBUG
+ mp_limb_t old_least_sign_r = rp[0];
+ mp_size_t old_rsize = rsize;
+
+ printf ("R = ");
+ mpn_dump (rp, rsize);
+#endif
+ tsize = tsizes[i];
+
+ /* Need to copy the numerator into temporary space, as
+ mpn_divmod overwrites its numerator argument with the
+ remainder (which we currently ignore). */
+ MPN_COPY (ttp, t_end_ptr - tsize, tsize);
+ cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
+ xsize = tsize - rsize;
+
+#ifdef SQRT_DEBUG
+ printf ("X =%d ", cy);
+ mpn_dump (xp, xsize);
+#endif
+
+ /* Add X and R with the most significant limbs aligned,
+ temporarily ignoring at least one limb at the low end of X. */
+ tmp = xsize - rsize;
+ cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
+
+ /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
+ intermediate roots that'd need an extra bit. We don't want to
+ handle that since it would make the subsequent divisor
+ non-normalized, so round such roots down to be only ones in the
+ current precision. */
+ if (cy == 2)
+ {
+ mp_size_t j;
+ for (j = xsize; j >= 0; j--)
+ xp[j] = ~(mp_limb_t)0;
+ }
+
+ /* Divide X by 2 and put the result in R. This is the new
+ approximation. Shift in the carry from the addition. */
+ mpn_rshift (rp, xp, xsize, 1);
+ rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
+ rsize = xsize;
+#ifdef SQRT_DEBUG
+ if (old_least_sign_r != rp[rsize - old_rsize])
+ printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
+ i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
+ 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
+#endif
+ }
+ }
+
+#ifdef SQRT_DEBUG
+ printf ("(final) R = ");
+ mpn_dump (rp, rsize);
+#endif
+
+ /* We computed the square root of OP * 2**(2*floor(cnt/2)).
+ This has resulted in R being 2**floor(cnt/2) too large.
+ Shift it down here to fix that. */
+ if (cnt / 2 != 0)
+ {
+ mpn_rshift (rp, rp, rsize, cnt/2);
+ rsize -= rp[rsize - 1] == 0;
+ }
+
+ /* Calculate the remainder. */
+ mpn_mul_n (tp, rp, rp, rsize);
+ tsize = rsize + rsize;
+ tsize -= tp[tsize - 1] == 0;
+ if (op_size < tsize
+ || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
+ {
+ /* R is too large. Decrement it. */
+
+ /* These operations can't overflow. */
+ cy_limb = mpn_sub_n (tp, tp, rp, rsize);
+ cy_limb += mpn_sub_n (tp, tp, rp, rsize);
+ mpn_decr_u (tp + rsize, cy_limb);
+ mpn_incr_u (tp, (mp_limb_t) 1);
+
+ mpn_decr_u (rp, (mp_limb_t) 1);
+
+#ifdef SQRT_DEBUG
+ printf ("(adjusted) R = ");
+ mpn_dump (rp, rsize);
+#endif
+ }
+
+ if (rem_ptr != NULL)
+ {
+ cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
+ MPN_NORMALIZE (rem_ptr, op_size);
+ TMP_FREE (marker);
+ return op_size;
+ }
+ else
+ {
+ int res;
+ res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
+ TMP_FREE (marker);
+ return res;
+ }
+}
diff --git a/rts/gmp/mpn/generic/sub_n.c b/rts/gmp/mpn/generic/sub_n.c
new file mode 100644
index 0000000000..4f2f06099c
--- /dev/null
+++ b/rts/gmp/mpn/generic/sub_n.c
@@ -0,0 +1,62 @@
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+ /* The loop counter and index J go from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to subtrahend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x - y; /* main subtract */
+ cy = (y > x) + cy; /* get out carry from the subtract, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
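+
+/* Editorial note on the borrow computation: for limbs x and y, the
+   subtraction y = x - y wraps around exactly when the result exceeds x,
+   so (y > x) recovers the borrow.  E.g. with 32-bit limbs, x = 0, y = 1
+   gives x - y = 0xFFFFFFFF > x, hence borrow = 1.  */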
diff --git a/rts/gmp/mpn/generic/submul_1.c b/rts/gmp/mpn/generic/submul_1.c
new file mode 100644
index 0000000000..c7c08ee4af
--- /dev/null
+++ b/rts/gmp/mpn/generic/submul_1.c
@@ -0,0 +1,65 @@
+/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, subtract the S1_SIZE least significant limbs of the product
+ from the limb vector pointed to by RES_PTR. Return the most significant
+ limb of the product, adjusted for carry-out from the subtraction.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+ /* The loop counter and index J go from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x - prod_low;
+ cy_limb += (prod_low > x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
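+
+/* Editorial note: with B = 2^BITS_PER_MP_LIMB and n = s1_size, the
+   result satisfies
+     {res_ptr,n}_new = {res_ptr,n}_old - s2_limb * {s1_ptr,n} + ret * B^n,
+   i.e. the returned limb combines the high limb of the product with the
+   borrow out of the subtraction.  */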
diff --git a/rts/gmp/mpn/generic/tdiv_qr.c b/rts/gmp/mpn/generic/tdiv_qr.c
new file mode 100644
index 0000000000..b748b5d810
--- /dev/null
+++ b/rts/gmp/mpn/generic/tdiv_qr.c
@@ -0,0 +1,401 @@
+/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
+ write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If
+ qxn is non-zero, generate that many fraction limbs and append them after the
+ other quotient limbs, and update the remainder accordingly. The input
+ operands are unaffected.
+
+ Preconditions:
+ 1. The most significant limb of the divisor must be non-zero.
+ 2. No argument overlap is permitted. (??? relax this ???)
+ 3. nn >= dn, even if qxn is non-zero. (??? relax this ???)
+
+ The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
+ complexity of multiplication.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+/* Extract the middle limb from ((h,,l) << cnt) */
+#define SHL(h,l,cnt) \
+ ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1))))
+
+void
+#if __STDC__
+mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
+ mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+#else
+mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_ptr rp;
+ mp_size_t qxn;
+ mp_srcptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+#endif
+{
+ /* FIXME:
+ 1. qxn
+ 2. pass allocated storage in additional parameter?
+ */
+ if (qxn != 0)
+ abort ();
+
+ switch (dn)
+ {
+ case 0:
+ DIVIDE_BY_ZERO;
+
+ case 1:
+ {
+ rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]);
+ return;
+ }
+
+ case 2:
+ {
+ int cnt;
+ mp_ptr n2p, d2p;
+ mp_limb_t qhl, cy;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
+ if (cy == 0)
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ default:
+ {
+ int adjust;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */
+ if (nn + adjust >= 2 * dn)
+ {
+ mp_ptr n2p, d2p;
+ mp_limb_t cy;
+ int cnt;
+ count_leading_zeros (cnt, dp[dn - 1]);
+
+ qp[nn - dn] = 0; /* zero high quotient limb */
+ if (cnt != 0) /* normalize divisor if needed */
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ nn += adjust;
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ n2p[nn] = 0;
+ nn += adjust;
+ }
+
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else if (dn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ else
+ {
+ /* Perform 2*dn / dn limb divisions as long as the limbs
+ in np last. */
+ mp_ptr q2p = qp + nn - 2 * dn;
+ n2p += nn - 2 * dn;
+ mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ nn -= dn;
+ while (nn >= 2 * dn)
+ {
+ mp_limb_t c;
+ q2p -= dn; n2p -= dn;
+ c = mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ ASSERT_ALWAYS (c == 0);
+ nn -= dn;
+ }
+
+ if (nn != dn)
+ {
+ n2p -= nn - dn;
+ /* In theory, we could fall out to the cute code below
+ since we now have exactly the situation that code
+ is designed to handle. We botch this badly and call
+ the basic mpn_sb_divrem_mn! */
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ }
+ }
+
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ /* When we come here, the numerator/partial remainder is less
+ than twice the size of the denominator. */
+
+ {
+ /* Problem:
+
+ Divide a numerator N with nn limbs by a denominator D with dn
+ limbs forming a quotient of nn-dn+1 limbs. When qn is small
+ compared to dn, conventional division algorithms perform poorly.
+ We want an algorithm that has an expected running time that is
+ dependent only on qn. It is assumed that the most significant
+ limb of the numerator is smaller than the most significant limb
+ of the denominator.
+
+ Algorithm (very informally stated):
+
+ 1) Divide the 2 x qn most significant limbs from the numerator
+ by the qn most significant limbs from the denominator. Call
+ the result qest. This is either the correct quotient, but
+ might be 1 or 2 too large. Compute the remainder from the
+ division. (This step is implemented by a mpn_divrem call.)
+
+ 2) Is the most significant limb from the remainder < p, where p
+ is the product of the most significant limb from the quotient
+ and the next(d). (Next(d) denotes the next ignored limb from
+ the denominator.) If it is, decrement qest, and adjust the
+ remainder accordingly.
+
+ 3) Is the remainder >= qest? If it is, qest is the desired
+ quotient. The algorithm terminates.
+
+ 4) Subtract qest x next(d) from the remainder. If there is
+ borrow out, decrement qest, and adjust the remainder
+ accordingly.
+
+ 5) Skip one word from the denominator (i.e., let next(d) denote
+ the next less significant limb). */
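+
+ /* Editorial illustration in base 10: for N = 123456, D = 6172
+ (qn = 2), step 1 divides the top four digits 1234 by the top two
+ digits 61, giving qest = 20; steps 2-4 then check qest against the
+ ignored divisor digits and adjust downward if needed. Here 20 is
+ already correct: 20 * 6172 = 123440 <= 123456. */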
+
+ mp_size_t qn;
+ mp_ptr n2p, d2p;
+ mp_ptr tp;
+ mp_limb_t cy;
+ mp_size_t in, rn;
+ mp_limb_t quotient_too_large;
+ int cnt;
+
+ qn = nn - dn;
+ qp[qn] = 0; /* zero high quotient limb */
+ qn += adjust; /* qn cannot become bigger */
+
+ if (qn == 0)
+ {
+ MPN_COPY (rp, np, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ in = dn - qn; /* (at least partially) ignored # of limbs in ops */
+ /* Normalize denominator by shifting it to the left such that its
+ most significant bit is set. Then shift the numerator the same
+ amount, to mathematically preserve quotient. */
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB);
+
+ mpn_lshift (d2p, dp + in, qn, cnt);
+ d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt);
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
+ if (adjust)
+ {
+ n2p[2 * qn] = cy;
+ n2p++;
+ }
+ else
+ {
+ n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt);
+ }
+ }
+ else
+ {
+ d2p = (mp_ptr) dp + in;
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
+ if (adjust)
+ {
+ n2p[2 * qn] = 0;
+ n2p++;
+ }
+ }
+
+ /* Get an approximate quotient using the extracted operands. */
+ if (qn == 1)
+ {
+ mp_limb_t q0, r0;
+ mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
+ /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
+ temps here. This doesn't hurt code quality on any machine,
+ so we do it unconditionally. */
+ gcc272bug_n1 = n2p[1];
+ gcc272bug_n0 = n2p[0];
+ gcc272bug_d0 = d2p[0];
+ udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0);
+ n2p[0] = r0;
+ qp[0] = q0;
+ }
+ else if (qn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, 4L, d2p);
+ else if (qn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn);
+ else
+ mpn_bz_divrem_n (qp, n2p, d2p, qn);
+
+ rn = qn;
+ /* Multiply the first ignored divisor limb by the most significant
+ quotient limb. If that product is > the partial remainder's
+ most significant limb, we know the quotient is too large. This
+ test quickly catches most cases where the quotient is too large;
+ it catches all cases where the quotient is 2 too large. */
+ {
+ mp_limb_t dl, x;
+ mp_limb_t h, l;
+
+ if (in - 2 < 0)
+ dl = 0;
+ else
+ dl = dp[in - 2];
+
+ x = SHL (dp[in - 1], dl, cnt);
+ umul_ppmm (h, l, x, qp[qn - 1]);
+
+ if (n2p[qn - 1] < h)
+ {
+ mp_limb_t cy;
+
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ cy = mpn_add_n (n2p, n2p, d2p, qn);
+ if (cy)
+ {
+ /* The partial remainder is safely large. */
+ n2p[qn] = cy;
+ ++rn;
+ }
+ }
+ }
+
+ quotient_too_large = 0;
+ if (cnt != 0)
+ {
+ mp_limb_t cy1, cy2;
+
+ /* Append partially used numerator limb to partial remainder. */
+ cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt);
+ n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt);
+
+ /* Update partial remainder with partially used divisor limb. */
+ cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt));
+ if (qn != rn)
+ {
+ if (n2p[qn] < cy2)
+ abort ();
+ n2p[qn] -= cy2;
+ }
+ else
+ {
+ n2p[qn] = cy1 - cy2;
+
+ quotient_too_large = (cy1 < cy2);
+ ++rn;
+ }
+ --in;
+ }
+ /* True: partial remainder now is neutral, i.e., it is not shifted up. */
+
+ tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+
+ if (in < qn)
+ {
+ if (in == 0)
+ {
+ MPN_COPY (rp, n2p, rn);
+ if (rn != dn)
+ abort ();
+ goto foo;
+ }
+ mpn_mul (tp, qp, qn, dp, in);
+ }
+ else
+ mpn_mul (tp, dp, in, qp, qn);
+
+ cy = mpn_sub (n2p, n2p, rn, tp + in, qn);
+ MPN_COPY (rp + in, n2p, dn - in);
+ quotient_too_large |= cy;
+ cy = mpn_sub_n (rp, np, tp, in);
+ cy = mpn_sub_1 (rp + in, rp + in, rn, cy);
+ quotient_too_large |= cy;
+ foo:
+ if (quotient_too_large)
+ {
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ mpn_add_n (rp, rp, dp, dn);
+ }
+ }
+ TMP_FREE (marker);
+ return;
+ }
+ }
+}
diff --git a/rts/gmp/mpn/generic/udiv_w_sdiv.c b/rts/gmp/mpn/generic/udiv_w_sdiv.c
new file mode 100644
index 0000000000..061cce86e1
--- /dev/null
+++ b/rts/gmp/mpn/generic/udiv_w_sdiv.c
@@ -0,0 +1,131 @@
+/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed
+ division.
+
+ Contributed by Peter L. Montgomery.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE
+ TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE
+ GNU MP RELEASE.
+
+
+Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_udiv_w_sdiv (rp, a1, a0, d)
+ mp_limb_t *rp, a1, a0, d;
+{
+ mp_limb_t q, r;
+ mp_limb_t c0, c1, b1;
+
+ if ((mp_limb_signed_t) d >= 0)
+ {
+ if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1)))
+ {
+ /* dividend, divisor, and quotient are nonnegative */
+ sdiv_qrnnd (q, r, a1, a0, d);
+ }
+ else
+ {
+ /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */
+ sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1));
+ /* Divide (c1*2^32 + c0) by d */
+ sdiv_qrnnd (q, r, c1, c0, d);
+ /* Add 2^31 to quotient */
+ q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+ else
+ {
+ b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */
+ c1 = a1 >> 1; /* A/2 */
+ c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1);
+
+ if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */
+ {
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */
+ {
+ c1 = (b1 - 1) - c1;
+ c0 = ~c0; /* logical NOT */
+
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ q = ~q; /* (A/2)/b1 */
+ r = (b1 - 1) - r;
+
+ r = 2*r + (a0 & 1); /* A/(2*b1) */
+
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else /* Implies c1 = b1 */
+ { /* Hence a1 = d - 1 = 2*b1 - 1 */
+ if (a0 >= -d)
+ {
+ q = -1;
+ r = a0 + d;
+ }
+ else
+ {
+ q = -2;
+ r = a0 + 2*d;
+ }
+ }
+ }
+
+ *rp = r;
+ return q;
+}
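+
+/* Editorial example, 32-bit limbs: for d = 7 (top bit clear), a1 = 0,
+   a0 = 100, the first branch applies since a1 < d - a1 - (a0 >> 31),
+   and sdiv_qrnnd directly yields q = 14, r = 2, matching udiv_qrnnd.
+   The other branches rearrange the operands so that each sdiv_qrnnd
+   call sees only nonnegative signed values.  */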
diff --git a/rts/gmp/mpn/hppa/README b/rts/gmp/mpn/hppa/README
new file mode 100644
index 0000000000..97e7abe011
--- /dev/null
+++ b/rts/gmp/mpn/hppa/README
@@ -0,0 +1,91 @@
+This directory contains mpn functions for various HP PA-RISC chips. Code
+that runs faster on the PA7100 and later implementations is in the pa7100
+directory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+ Load and Store timing
+
+On the PA7000, no memory instructions can issue in the two cycles after a store.
+For the PA7100, this is reduced to one cycle.
+
+The PA7100 has a lockup-free cache, so it helps to schedule loads and their
+dependent instructions far apart.
+
+STATUS
+
+1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the
+ instructions below (but some software pipelining is needed to avoid the
+ xmpyu-fstds delay):
+
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib Loop
+
+2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb
+ (asymptotically) on the PA7100, using the instructions below. With proper
+ software pipelining and the unrolling level below, the speed becomes 8
+ cycles/limb.
+
+ fldds s1_ptr
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ addc
+ addc
+ addc
+ addc
+ addc %r0,%r0,cy-limb
+
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ add
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib
+
+3. For the PA8000 we have to stick to using 32-bit limbs until compiler
+ support emerges. But we want to use 64-bit operations whenever possible,
+ in particular for loads and stores. It is possible to handle mpn_add_n
+ efficiently by rotating when s1/s2 are aligned, and by masking+bit-field
+ inserting when they are not. The speed should double compared to the
+ code used today.
diff --git a/rts/gmp/mpn/hppa/add_n.s b/rts/gmp/mpn/hppa/add_n.s
new file mode 100644
index 0000000000..c53b2f71b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/add_n.s
@@ -0,0 +1,58 @@
+; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless. We can't come under 5 cycles/limb anyway.
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L$end ; check for (SIZE == 1)
+ add %r20,%r19,%r28 ; add first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L$loop
+ addc %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/gmp-mparam.h b/rts/gmp/mpn/hppa/gmp-mparam.h
new file mode 100644
index 0000000000..98b6d9ce3c
--- /dev/null
+++ b/rts/gmp/mpn/hppa/gmp-mparam.h
@@ -0,0 +1,63 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the PA7100 using GCC. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 30
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 172
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 59
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 185
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 96
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 122
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 18
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
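+
+/* Editorial note: the thresholds above are operand sizes in limbs at
+   which GMP switches algorithms, e.g. multiplications of at least
+   KARATSUBA_MUL_THRESHOLD limbs use Karatsuba rather than the
+   basecase routine.  */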
diff --git a/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
new file mode 100644
index 0000000000..c7d218f922
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
@@ -0,0 +1,102 @@
+; HP-PA-1.1 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 11 cycles/limb on a PA7000. With the instructions used, it
+; cannot become faster, due to data cache contention after a store. On the
+; PA7100 it runs at 10 cycles/limb, and that cannot be improved either,
+; since only the xmpyu avoids the integer pipeline, so the only
+; dual-issue we will get is addc+xmpyu. Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that apply to this code too.
+
+ .code
+ .export __gmpn_addmul_1
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ add %r29,%r1,%r19
+ stw %r19,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/mul_1.s b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
new file mode 100644
index 0000000000..4512fddec9
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
@@ -0,0 +1,98 @@
+; HP-PA-1.1 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 9 cycles/limb on a PA7000. With the instructions used, it
+; cannot become faster, due to data cache contention after a store. On the
+; PA7100 it runs at 7 cycles/limb, and that cannot be improved either, since
+; only the xmpyu avoids the integer pipeline, so the only dual-issue
+; we will get is addc+xmpyu. Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively. We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue. If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early! (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+ .code
+ .export __gmpn_mul_1
+__gmpn_mul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop fldws,ma 4(%r25),%fr5
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ fstds %fr6,-16(%r30)
+ ldw -16(%r30),%r28
+ ldo -64(%r30),%r30
+ bv 0(%r2)
+ fstws %fr6R,0(%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
new file mode 100644
index 0000000000..4f4be08b37
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
@@ -0,0 +1,75 @@
+; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+; This is optimized for the PA7100, where it runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ add %r20,%r19,%r28 ; add first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ addc %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ addc %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
new file mode 100644
index 0000000000..04db06822e
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
@@ -0,0 +1,189 @@
+; HP-PA 7100/7200 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __gmpn_addmul_1
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ add s0,lo0,s0
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ add s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ addc s1,lo1,s1
+ fstds %fr9,0(%r31)
+ addc s2,lo2,s2
+ fstds %fr10,8(%r31)
+ addc s3,lo3,s3
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addc s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ addc s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ addc s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
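
The software pipeline above keeps four 32x32->64 products in flight: xmpyu
leaves each product in an FP register, the fstds/ldws pairs move its halves to
integer registers via the stack, and the addc chains fold low halves, high
halves, and the old limbs at res_ptr together.  Functionally it is the plain
addmul_1 recurrence, sketched below in C; the typedefs and names are
illustrative assumptions (32-bit limbs), not GMP source.

    /* Sketch of __gmpn_addmul_1:
       res[0..n-1] += s1[0..n-1] * limb, returning the final high-limb carry. */
    typedef unsigned int mp_limb_t;            /* assumption: 32-bit limbs */
    typedef unsigned long long mp_wide_t;      /* holds a full 32x32 product */

    mp_limb_t
    ref_addmul_1 (mp_limb_t *res, const mp_limb_t *s1, long n, mp_limb_t limb)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          mp_wide_t p = (mp_wide_t) s1[i] * limb + res[i] + cy;
          res[i] = (mp_limb_t) p;          /* low half back to memory */
          cy = (mp_limb_t) (p >> 32);      /* high half feeds the next limb */
        }
      return cy;
    }
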
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
new file mode 100644
index 0000000000..31669b1a55
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
@@ -0,0 +1,83 @@
+; HP-PA __gmpn_lshift --
+; This is optimized for the PA7100, where it runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_lshift
+__gmpn_lshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L$0004
+ vshd %r0,%r22,%r28 ; compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r22,%r29,%r20
+
+L$loop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r22,%r29,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r22,%r29,%r20
+
+L$end1 stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+L$end2 stws,mb %r20,-4(0,%r26)
+L$0004 vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .exit
+ .procend
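
Both shift routines drive vshd, which extracts a 32-bit window from a pair of
registers under control of the shift amount register loaded with mtsar;
lshift walks from the most significant limb downwards (the ,mb pre-decrement
forms), and rshift below is its low-to-high mirror image.  A minimal C sketch
of the lshift semantics, assuming 32-bit limbs and 0 < cnt < 32 (identifiers
are illustrative):

    /* Sketch of __gmpn_lshift: shift the n-limb number left by cnt bits,
       working high limb first, and return the bits shifted out of the top. */
    typedef unsigned int mp_limb_t;   /* assumption: 32-bit limbs */

    mp_limb_t
    ref_lshift (mp_limb_t *res, const mp_limb_t *src, long n, unsigned cnt)
    {
      mp_limb_t out = src[n - 1] >> (32 - cnt);   /* carry-out limb */
      for (long i = n - 1; i > 0; i--)
        res[i] = (src[i] << cnt) | (src[i - 1] >> (32 - cnt));
      res[0] = src[0] << cnt;
      return out;
    }
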
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
new file mode 100644
index 0000000000..d32b10b4b1
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
@@ -0,0 +1,80 @@
+; HP-PA __gmpn_rshift --
+; This is optimized for the PA7100, where it runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_rshift
+__gmpn_rshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L$0004
+ vshd %r22,%r0,%r28 ; compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r29,%r22,%r20
+
+L$loop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r29,%r22,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r29,%r22,%r20
+
+L$end1 stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+L$end2 stws,ma %r20,4(0,%r26)
+L$0004 vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
new file mode 100644
index 0000000000..0eec41c4b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
@@ -0,0 +1,76 @@
+; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+; This is optimized for the PA7100, where it runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ subb %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ subb %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+
+ .exit
+ .procend
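
The only subtlety versus add_n is the return value: after subb the PA-RISC
carry flag holds the complement of the borrow, so the epilogue materializes
it with addc %r0,%r0 and flips it with subi 1.  In C terms the routine
computes the following (a sketch assuming 32-bit limbs; names illustrative):

    /* Sketch of __gmpn_sub_n: res[i] = s1[i] - s2[i] - borrow-in,
       returning the final borrow (0 or 1). */
    typedef unsigned int mp_limb_t;   /* assumption: 32-bit limbs */

    mp_limb_t
    ref_sub_n (mp_limb_t *res, const mp_limb_t *s1, const mp_limb_t *s2, long n)
    {
      mp_limb_t bw = 0;   /* borrow; the asm tracks !borrow in the carry flag */
      for (long i = 0; i < n; i++)
        {
          mp_limb_t a = s1[i], b = s2[i];
          res[i] = a - b - bw;
          bw = (a < b) || (a == b && bw);
        }
      return bw;
    }
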
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
new file mode 100644
index 0000000000..0fba21dcef
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
@@ -0,0 +1,195 @@
+; HP-PA 7100/7200 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __gmpn_submul_1
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ sub s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ subb s1,lo1,s1
+ fstds %fr9,0(%r31)
+ subb s2,lo2,s2
+ fstds %fr10,8(%r31)
+ subb s3,lo3,s3
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... just invert cy
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ sub s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ subb s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ subb s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ subb s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... invert cy
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/submul_1.s b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
new file mode 100644
index 0000000000..20a5b5ce0a
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
@@ -0,0 +1,111 @@
+; HP-PA-1.1 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 12 cycles/limb on a PA7000.  With the instructions used, it
+; cannot run any faster, due to data cache contention after a store.  On the
+; PA7100 it runs at 11 cycles/limb, and that cannot be improved either,
+; since only the xmpyu avoids the integer pipeline, so the only dual-issue
+; we will get is addc+xmpyu.  Unrolling could gain a cycle/limb on the
+; PA7100.
+
+; There are some ideas described in mul_1.s that apply to this code too.
+
+; It seems possible to make this run as fast as __gmpn_addmul_1, if we use
+; sub,>>= %r29,%r19,%r22
+; addi 1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+ .code
+ .export __gmpn_submul_1
+__gmpn_submul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ sub %r29,%r1,%r22
+ add %r22,%r1,%r0
+ stw %r22,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
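
The sub/add pairs in the loop above (sub %r29,%r19,%r22 followed by
add %r22,%r19,%r0) look redundant, but the add recomputes the minuend into
the bit bucket %r0 purely to flip the carry flag into borrow sense, as the
"invert cy" comments say, so that the following addc can fold the borrow into
the running product carry.  The net computation, sketched in C (32-bit limbs
assumed; identifiers illustrative):

    /* Sketch of __gmpn_submul_1:
       res[0..n-1] -= s1[0..n-1] * limb, returning the stream of product
       high halves plus the final borrow. */
    typedef unsigned int mp_limb_t;            /* assumption: 32-bit limbs */
    typedef unsigned long long mp_wide_t;      /* holds a full 32x32 product */

    mp_limb_t
    ref_submul_1 (mp_limb_t *res, const mp_limb_t *s1, long n, mp_limb_t limb)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          mp_wide_t p = (mp_wide_t) s1[i] * limb + cy;
          mp_limb_t lo = (mp_limb_t) p;
          mp_limb_t r = res[i];
          res[i] = r - lo;
          cy = (mp_limb_t) (p >> 32) + (r < lo);   /* high half + borrow */
        }
      return cy;
    }
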
diff --git a/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
new file mode 100644
index 0000000000..b83d6f4dd2
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
@@ -0,0 +1,80 @@
+; HP-PA __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr gr26
+; n1 gr25
+; n0 gr24
+; d gr23
+
+ .code
+L$0000 .word 0x43f00000 ; 2^64
+ .word 0x0
+ .export __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+ ldo 64(%r30),%r30
+
+ stws %r25,-16(0,%r30) ; n_hi
+ stws %r24,-12(0,%r30) ; n_lo
+#ifdef PIC
+ addil LT%L$0000,%r19
+ ldo RT%L$0000(%r1),%r19
+#else
+ ldil L%L$0000,%r19
+ ldo R%L$0000(%r19),%r19
+#endif
+ fldds -16(0,%r30),%fr5
+ stws %r23,-12(0,%r30)
+ comib,<= 0,%r25,L$1
+ fcnvxf,dbl,dbl %fr5,%fr5
+ fldds 0(0,%r19),%fr4
+ fadd,dbl %fr4,%fr5,%fr5
+L$1
+ fcpy,sgl %fr0,%fr6L
+ fldws -12(0,%r30),%fr6R
+ fcnvxf,dbl,dbl %fr6,%fr4
+
+ fdiv,dbl %fr5,%fr4,%fr5
+
+ fcnvfx,dbl,dbl %fr5,%fr4
+ fstws %fr4R,-16(%r30)
+ xmpyu %fr4R,%fr6R,%fr6
+ ldws -16(%r30),%r28
+ fstds %fr6,-16(0,%r30)
+ ldws -12(0,%r30),%r21
+ ldws -16(0,%r30),%r20
+ sub %r24,%r21,%r22
+ subb %r25,%r20,%r19
+ comib,= 0,%r19,L$2
+ ldo -64(%r30),%r30
+
+ add %r22,%r23,%r22
+ ldo -1(%r28),%r28
+L$2 bv 0(%r2)
+ stws %r22,0(0,%r26)
+
+ .exit
+ .procend
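
This routine divides in floating point: (n1,n0) is converted to a double as a
signed 64-bit integer, with the L$0000 constant (2^64 as a double) added back
when n1 has its top bit set; dividing by d as a double and truncating gives a
candidate quotient, which xmpyu multiplies back so the remainder can be
checked and the pair nudged by one if the float quotient overshot.  The
externally visible contract, as a hedged C sketch (32-bit limbs and the usual
n1 < d precondition assumed; names illustrative):

    /* Sketch of __gmpn_udiv_qrnnd: divide the two-limb value n1*2^32 + n0
       by d, assuming n1 < d so the quotient fits in one limb; store the
       remainder and return the quotient. */
    typedef unsigned int mp_limb_t;            /* assumption: 32-bit limbs */
    typedef unsigned long long mp_wide_t;

    mp_limb_t
    ref_udiv_qrnnd (mp_limb_t *rem_ptr, mp_limb_t n1, mp_limb_t n0, mp_limb_t d)
    {
      mp_wide_t n = ((mp_wide_t) n1 << 32) | n0;
      *rem_ptr = (mp_limb_t) (n % d);
      return (mp_limb_t) (n / d);
    }
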
diff --git a/rts/gmp/mpn/hppa/hppa1_1/umul.s b/rts/gmp/mpn/hppa/hppa1_1/umul.s
new file mode 100644
index 0000000000..1f1300ac9b
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/umul.s
@@ -0,0 +1,42 @@
+; Copyright (C) 1999 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .code
+ .export __umul_ppmm
+ .align 4
+__umul_ppmm
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ stw %r25,-16(0,%r30)
+ fldws -16(0,%r30),%fr22R
+ stw %r24,-16(0,%r30)
+ fldws -16(0,%r30),%fr22L
+ xmpyu %fr22R,%fr22L,%fr22
+ fstds %fr22,-16(0,%r30)
+ ldw -16(0,%r30),%r28
+ ldw -12(0,%r30),%r29
+ stw %r29,0(0,%r26)
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
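
umul.s is the smallest of these helpers: xmpyu forms the full 64-bit product
in an FP register, the fstds/ldw sequence splits it via the stack (PA-RISC is
big-endian, so the word at -16 is the high half), and the routine returns the
high word in %r28 while storing the low word through gr26.  As a C sketch
(32-bit limbs assumed; names illustrative):

    /* Sketch of __umul_ppmm: full 32x32 -> 64 multiply, returning the
       high word and storing the low word through lo_ptr. */
    typedef unsigned int mp_limb_t;            /* assumption: 32-bit limbs */
    typedef unsigned long long mp_wide_t;

    mp_limb_t
    ref_umul_ppmm (mp_limb_t *lo_ptr, mp_limb_t a, mp_limb_t b)
    {
      mp_wide_t p = (mp_wide_t) a * b;
      *lo_ptr = (mp_limb_t) p;          /* low word, the stw to 0(%r26) */
      return (mp_limb_t) (p >> 32);     /* high word, returned in %r28 */
    }
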
diff --git a/rts/gmp/mpn/hppa/hppa2_0/add_n.s b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
new file mode 100644
index 0000000000..6e97278a39
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_add_n -- Add two limb vectors of the same length > 0
+; and store sum in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,0(%r26)
+L$7 ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,4(%r26)
+L$6 ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,8(%r26)
+L$5 ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,12(%r26)
+L$4 ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,16(%r26)
+L$3 ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,20(%r26)
+L$2 ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,24(%r26)
+L$1 ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 32(%r26),%r26
+
+ bv (%r2)
+ .exit
+ addc %r0,%r0,%r28
+ .procend
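
The blr above is a computed branch into an eight-way unrolled loop, the
assembly analogue of Duff's device: r28 = 2 * (-n & 7) is the blr offset
(blr steps in 8-byte units, so this skips one four-instruction
load/load/addc/store group per limb the first pass must not handle), and the
three pointers are pre-biased backwards by 4 * (-n & 7) bytes so the
fixed-offset loads in the entered-into pass still address the right limbs.
A hedged C analogue of the entry trick (illustrative names, 32-bit limbs
assumed; the real routine biases pointers instead of keeping a count):

    typedef unsigned int mp_limb_t;   /* assumption: 32-bit limbs */

    /* One add-with-carry step, standing in for a ldw/ldw/addc/stw group. */
    static mp_limb_t
    addc_step (mp_limb_t a, mp_limb_t b, mp_limb_t *cy)
    {
      mp_limb_t s = a + b;
      mp_limb_t c = s < a;
      s += *cy;
      c += s < *cy;
      *cy = c;
      return s;
    }

    mp_limb_t
    ref_add_n_8way (mp_limb_t *res, const mp_limb_t *s1,
                    const mp_limb_t *s2, long n)
    {
      mp_limb_t cy = 0;
      long i = 0;
      switch ((-n) & 7)   /* the blr: enter the unrolled body part-way in */
        {
        case 0: do { res[i] = addc_step (s1[i], s2[i], &cy); i++;
        case 1:      res[i] = addc_step (s1[i], s2[i], &cy); i++;
        case 2:      res[i] = addc_step (s1[i], s2[i], &cy); i++;
        case 3:      res[i] = addc_step (s1[i], s2[i], &cy); i++;
        case 4:      res[i] = addc_step (s1[i], s2[i], &cy); i++;
        case 5:      res[i] = addc_step (s1[i], s2[i], &cy); i++;
        case 6:      res[i] = addc_step (s1[i], s2[i], &cy); i++;
        case 7:      res[i] = addc_step (s1[i], s2[i], &cy); i++;
                   } while (i < n);
        }
      return cy;
    }

sub_n below uses the same entry scheme, with subb in place of addc and the
complemented-borrow fix-up in its epilogue.
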
diff --git a/rts/gmp/mpn/hppa/hppa2_0/sub_n.s b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
new file mode 100644
index 0000000000..7d9b50fc27
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_sub_n -- Subtract two limb vectors of the same
+; length > 0 and store difference in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry
+
+L$loop ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,0(%r26)
+L$7 ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,4(%r26)
+L$6 ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,8(%r26)
+L$5 ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,12(%r26)
+L$4 ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,16(%r26)
+L$3 ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,20(%r26)
+L$2 ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,24(%r26)
+L$1 ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 32(%r26),%r26
+
+ addc %r0,%r0,%r28
+ bv (%r2)
+ .exit
+ subi 1,%r28,%r28
+ .procend
diff --git a/rts/gmp/mpn/hppa/lshift.s b/rts/gmp/mpn/hppa/lshift.s
new file mode 100644
index 0000000000..f5a2daad60
--- /dev/null
+++ b/rts/gmp/mpn/hppa/lshift.s
@@ -0,0 +1,66 @@
+; HP-PA __gmpn_lshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_lshift
+__gmpn_lshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L$0004
+ vshd %r0,%r22,%r28 ; compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,= -1,%r24,L$0002
+ vshd %r22,%r29,%r20
+
+L$loop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,= -1,%r24,L$0003
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,<> -1,%r24,L$loop
+ vshd %r22,%r29,%r20
+
+L$0002 stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+L$0003 stws,mb %r20,-4(0,%r26)
+L$0004 vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/rshift.s b/rts/gmp/mpn/hppa/rshift.s
new file mode 100644
index 0000000000..e05e2f10b5
--- /dev/null
+++ b/rts/gmp/mpn/hppa/rshift.s
@@ -0,0 +1,63 @@
+; HP-PA __gmpn_rshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_rshift
+__gmpn_rshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L$0004
+ vshd %r22,%r0,%r28 ; compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,= -1,%r24,L$0002
+ vshd %r29,%r22,%r20
+
+L$loop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,= -1,%r24,L$0003
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,<> -1,%r24,L$loop
+ vshd %r29,%r22,%r20
+
+L$0002 stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+L$0003 stws,ma %r20,4(0,%r26)
+L$0004 vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/sub_n.s b/rts/gmp/mpn/hppa/sub_n.s
new file mode 100644
index 0000000000..8f770ad1ad
--- /dev/null
+++ b/rts/gmp/mpn/hppa/sub_n.s
@@ -0,0 +1,59 @@
+; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't get below 5 cycles/limb anyway.
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L$end ; check for (SIZE == 1)
+ sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L$loop
+ subb %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/udiv_qrnnd.s b/rts/gmp/mpn/hppa/udiv_qrnnd.s
new file mode 100644
index 0000000000..9aa3b8a830
--- /dev/null
+++ b/rts/gmp/mpn/hppa/udiv_qrnnd.s
@@ -0,0 +1,286 @@
+; HP-PA __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr gr26
+; n1 gr25
+; n0 gr24
+; d gr23
+
+; The code size is a bit excessive.  We could merge the last two ds;addc
+; sequences by simply moving the "bb,< L$odd" instruction down.  The only
+; trouble is the 0xFFFFFFFF special case (L$FF..), which would need some
+; hacking.
+
+ .code
+ .export __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ comb,< %r23,0,L$largedivisor
+ sub %r0,%r23,%r1 ; clear cy as side-effect
+ ds %r0,%r1,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r28
+ ds %r25,%r23,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r23,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r28,%r28,%r28
+
+L$largedivisor
+ extru %r24,31,1,%r19 ; r19 = n0 & 1
+ bb,< %r23,31,L$odd
+ extru %r23,30,31,%r22 ; r22 = d >> 1
+ shd %r25,%r24,1,%r24 ; r24 = new n0
+ extru %r25,30,31,%r25 ; r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r24,%r24,%r28
+
+L$odd addib,sv,n 1,%r22,L$FF.. ; r22 = (d / 2 + 1)
+ shd %r25,%r24,1,%r24 ; r24 = new n0
+ extru %r25,30,31,%r25 ; r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r28
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+ add,nuv %r28,%r25,%r25
+ addl %r25,%r1,%r25
+ addc %r0,%r28,%r28
+ sub,<< %r25,%r23,%r0
+ addl %r25,%r1,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r28,%r28
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF.. add,uv %r25,%r24,%r24
+ sub,<< %r24,%r23,%r0
+ ldo 1(%r24),%r24
+ stws %r24,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r25,%r28
+
+ .exit
+ .procend
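
The correction at the end of the L$odd path follows from the comment above
it: the shift-and-subtract sequence produced q' (r28) and r' (r25) for a
division by d + 1, and since n = q'*(d + 1) + r' = q'*d + (q' + r'), folding
q' into the remainder and reducing modulo d recovers the true pair, which the
add,nuv/sub,<< instructions do branch-free.  A hedged C sketch of just that
identity (not of the exact register-level fix-up; names illustrative):

    /* Given n = qp*(d+1) + rp, recover q and r with n = q*d + r, 0 <= r < d.
       Since n = qp*d + (qp + rp), fold qp into the remainder and reduce.
       For the large divisors handled in L$odd (d >= 2^31, qp < 2^32) the
       loop runs at most a few times. */
    typedef unsigned int mp_limb_t;            /* assumption: 32-bit limbs */
    typedef unsigned long long mp_wide_t;

    static void
    correct_qr (mp_limb_t qp, mp_limb_t rp, mp_limb_t d,
                mp_limb_t *q, mp_limb_t *r)
    {
      mp_wide_t rr = (mp_wide_t) qp + rp;   /* may exceed one limb */
      mp_limb_t qq = qp;
      while (rr >= d)
        {
          rr -= d;
          qq++;
        }
      *q = qq;
      *r = (mp_limb_t) rr;
    }
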
diff --git a/rts/gmp/mpn/i960/README b/rts/gmp/mpn/i960/README
new file mode 100644
index 0000000000..d68a0a83eb
--- /dev/null
+++ b/rts/gmp/mpn/i960/README
@@ -0,0 +1,9 @@
+This directory contains mpn functions for Intel i960 processors.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The code in this directory is not well optimized.
+
+STATUS
+
+The code in this directory has not been tested.
diff --git a/rts/gmp/mpn/i960/add_n.s b/rts/gmp/mpn/i960/add_n.s
new file mode 100644
index 0000000000..387317a397
--- /dev/null
+++ b/rts/gmp/mpn/i960/add_n.s
@@ -0,0 +1,43 @@
+# I960 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_add_n
+___gmpn_add_n:
+ mov 0,g6 # clear carry-save register
+ cmpo 1,0 # clear cy
+
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ addc g4,g5,g4 # main add
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, clears C bit
+
+ mov g6,g0
+ ret
diff --git a/rts/gmp/mpn/i960/addmul_1.s b/rts/gmp/mpn/i960/addmul_1.s
new file mode 100644
index 0000000000..7df1418356
--- /dev/null
+++ b/rts/gmp/mpn/i960/addmul_1.s
@@ -0,0 +1,48 @@
+# I960 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+	.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ subo g2,0,g2
+ shlo 2,g2,g4
+ subo g4,g1,g1
+ subo g4,g0,g13
+ mov 0,g0
+
+ cmpo 1,0 # clear C bit on AC.cc
+
+Loop: ld (g1)[g2*4],g5
+ emul g3,g5,g6
+ ld (g13)[g2*4],g5
+
+	addc	g0,g6,g6	# relies on the C bit being clear
+	addc	0,g7,g7
+	addc	g5,g6,g6	# relies on the C bit being clear
+ st g6,(g13)[g2*4]
+ addc 0,g7,g0
+
+ addo g2,1,g2
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/rts/gmp/mpn/i960/mul_1.s b/rts/gmp/mpn/i960/mul_1.s
new file mode 100644
index 0000000000..5c0c985aa5
--- /dev/null
+++ b/rts/gmp/mpn/i960/mul_1.s
@@ -0,0 +1,45 @@
+# I960 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_mul_1
+___gmpn_mul_1:
+ subo g2,0,g2
+ shlo 2,g2,g4
+ subo g4,g1,g1
+ subo g4,g0,g13
+ mov 0,g0
+
+ cmpo 1,0 # clear C bit on AC.cc
+
+Loop: ld (g1)[g2*4],g5
+ emul g3,g5,g6
+
+	addc	g0,g6,g6	# relies on the C bit being clear
+ st g6,(g13)[g2*4]
+ addc 0,g7,g0
+
+ addo g2,1,g2
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/rts/gmp/mpn/i960/sub_n.s b/rts/gmp/mpn/i960/sub_n.s
new file mode 100644
index 0000000000..2db2d46aad
--- /dev/null
+++ b/rts/gmp/mpn/i960/sub_n.s
@@ -0,0 +1,43 @@
+# I960 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_sub_n
+___gmpn_sub_n:
+ mov 1,g6 # set carry-save register
+ cmpo 1,0 # clear cy
+
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ subc g4,g5,g4 # main subtract
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, cy will be 0
+
+ mov g6,g0
+ ret
diff --git a/rts/gmp/mpn/lisp/gmpasm-mode.el b/rts/gmp/mpn/lisp/gmpasm-mode.el
new file mode 100644
index 0000000000..5d9da7fa1f
--- /dev/null
+++ b/rts/gmp/mpn/lisp/gmpasm-mode.el
@@ -0,0 +1,351 @@
+;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode.
+
+
+;; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+;;
+;; This file is part of the GNU MP Library.
+;;
+;; The GNU MP Library is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU Lesser General Public License as published by
+;; the Free Software Foundation; either version 2.1 of the License, or (at your
+;; option) any later version.
+;;
+;; The GNU MP Library is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU Lesser General Public License
+;; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+;; MA 02111-1307, USA.
+
+
+;;; Commentary:
+;;
+;; gmpasm-mode is an editing mode for m4 processed assembler code and m4
+;; macro files in GMP. It's similar to m4-mode, but has a number of
+;; settings better suited to GMP.
+;;
+;;
+;; Install
+;; -------
+;;
+;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in the
+;; load-path and the following in .emacs
+;;
+;; (autoload 'gmpasm-mode "gmpasm-mode" nil t)
+;;
+;; To use gmpasm-mode automatically on all .asm and .m4 files, put the
+;; following in .emacs
+;;
+;; (add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode))
+;; (add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode))
+;;
+;; To have gmpasm-mode only on gmp files, try instead something like the
+;; following, which uses it only in a directory starting with "gmp", or a
+;; sub-directory of such.
+;;
+;; (add-to-list 'auto-mode-alist
+;; '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode))
+;;
+;; Byte compiling will slightly speed up loading. If you want a docstring
+;; in the autoload you can use M-x update-file-autoloads if you set it up
+;; right.
+;;
+;;
+;; Emacsen
+;; -------
+;;
+;; FSF Emacs 20.x - gmpasm-mode is designed for this.
+;; XEmacs 20.x - seems to work.
+;;
+;; FSF Emacs 19.x - should work if replacements for some 20.x-isms are
+;; available. comment-region with "C" won't really do the right thing
+;; though.
+
+
+;;; Code:
+
+(defgroup gmpasm nil
+ "GNU MP m4 and asm editing."
+ :prefix "gmpasm-"
+ :group 'languages)
+
+(defcustom gmpasm-mode-hook nil
+ "*Hook called by `gmpasm-mode'."
+ :type 'hook
+ :group 'gmpasm)
+
+(defcustom gmpasm-comment-start-regexp "[#;!@C]"
+ "*Regexp matching possible comment styles.
+See `gmpasm-mode' docstring for how this is used."
+ :type 'regexp
+ :group 'gmpasm)
+
+
+(defun gmpasm-add-to-list-second (list-var element)
+ "(gmpasm-add-to-list-second LIST-VAR ELEMENT)
+
+Add ELEMENT to LIST-VAR as the second element in the list, if it isn't
+already in the list. If LIST-VAR is nil, then ELEMENT is just added as the
+sole element in the list.
+
+This is like `add-to-list', but it puts the new value second in the list.
+
+The first cons cell is copied rather than changed in-place, so references to
+the list elsewhere won't be affected."
+
+ (if (member element (symbol-value list-var))
+ (symbol-value list-var)
+ (set list-var
+ (if (symbol-value list-var)
+ (cons (car (symbol-value list-var))
+ (cons element
+ (cdr (symbol-value list-var))))
+ (list element)))))
+
+
+(defun gmpasm-delete-from-list (list-var element)
+ "(gmpasm-delete-from-list LIST-VAR ELEMENT)
+
+Delete ELEMENT from LIST-VAR, using `delete'.
+This is like `add-to-list', but the element is deleted from the list.
+The list is copied rather than changed in-place, so references to it elsewhere
+won't be affected."
+
+ (set list-var (delete element (copy-sequence (symbol-value list-var)))))
+
+
+(defvar gmpasm-mode-map
+ (let ((map (make-sparse-keymap)))
+
+ ;; assembler and dnl commenting
+ (define-key map "\C-c\C-c" 'comment-region)
+ (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl)
+
+ ;; kill an M-x compile, since it's not hard to put m4 into an infinite
+ ;; loop
+ (define-key map "\C-c\C-k" 'kill-compilation)
+
+ map)
+ "Keymap for `gmpasm-mode'.")
+
+
+(defvar gmpasm-mode-syntax-table
+ (let ((table (make-syntax-table)))
+ ;; underscore left as a symbol char, like C mode
+
+ ;; m4 quotes
+ (modify-syntax-entry ?` "('" table)
+ (modify-syntax-entry ?' ")`" table)
+
+ table)
+ "Syntax table used in `gmpasm-mode'.
+
+m4 ignores quote marks in # comments at the top level, but inside quotes #
+isn't special and all quotes are active. There seems no easy way to express
+this in the syntax table, so nothing is done for comments. Usually this is
+best, since it picks up invalid apostrophes in comments inside quotes.")
+
+
+(defvar gmpasm-font-lock-keywords
+ (eval-when-compile
+ (list
+ (cons
+ (concat
+ "\\b"
+ (regexp-opt
+ '("deflit" "defreg" "defframe" "defframe_pushl"
+ "define_not_for_expansion"
+ "ASM_START" "ASM_END" "PROLOGUE" "EPILOGUE"
+ "forloop"
+ "TEXT" "DATA" "ALIGN" "W32"
+ "builtin" "changecom" "changequote" "changeword" "debugfile"
+ "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef"
+ "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef"
+ "ifelse" "include" "incr" "index" "indir" "len" "__line__"
+ "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef"
+ "regexp" "shift" "sinclude" "substr" "syscmd" "sysval"
+ "traceoff" "traceon" "translit" "undefine" "undivert" "unix")
+ t)
+ "\\b") 'font-lock-keyword-face)))
+
+ "`font-lock-keywords' for `gmpasm-mode'.
+
+The keywords are m4 builtins and some of the GMP macros used in asm files.
+L and LF don't look good fontified, so they're omitted.
+
+The right assembler comment regexp is added dynamically buffer-local (with
+dnl too).")
+
+
+;; Initialized if gmpasm-mode finds filladapt loaded.
+(defvar gmpasm-filladapt-token-table nil
+ "Filladapt token table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-match-table nil
+ "Filladapt token match table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-conversion-table nil
+ "Filladapt token conversion table used in `gmpasm-mode'.")
+
+
+;;;###autoload
+(defun gmpasm-mode ()
+ "A major mode for editing GNU MP asm and m4 files.
+
+\\{gmpasm-mode-map}
+`comment-start' and `comment-end' are set buffer-local to assembler
+commenting appropriate for the CPU by looking for something matching
+`gmpasm-comment-start-regexp' at the start of a line, or \"#\" is used if
+there's no match (if \"#\" isn't what you want, type in a desired comment
+and do \\[gmpasm-mode] to reinitialize).
+
+`adaptive-fill-regexp' is set buffer-local to the standard regexp with
+`comment-start' and dnl added. If filladapt.el has been loaded it similarly
+gets `comment-start' and dnl added as buffer-local fill prefixes.
+
+Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting,
+and assembler commenting (based on the `comment-start' determined).
+
+Note that `gmpasm-comment-start-regexp' is matched only as a whole word, so
+the `C' in it matches a lone `C', not something that merely happens to start
+with `C'.  Also, only the particular `comment-start' determined is added for
+filling etc, not the whole `gmpasm-comment-start-regexp'.
+
+`gmpasm-mode-hook' is run after initializations are complete.
+"
+
+ (interactive)
+ (kill-all-local-variables)
+ (setq major-mode 'gmpasm-mode
+ mode-name "gmpasm")
+ (use-local-map gmpasm-mode-map)
+ (set-syntax-table gmpasm-mode-syntax-table)
+ (setq fill-column 76)
+
+ ;; Short instructions might fit with 32, but anything with labels or
+ ;; expressions soon needs the comments pushed out to column 40.
+ (setq comment-column 40)
+
+ ;; Don't want to find out the hard way which dumb assemblers don't like a
+ ;; missing final newline.
+ (set (make-local-variable 'require-final-newline) t)
+
+ ;; The first match of gmpasm-comment-start-regexp at the start of a line
+ ;; determines comment-start, or "#" if no match.
+ (set (make-local-variable 'comment-start)
+ (save-excursion
+ (goto-char (point-min))
+ (if (re-search-forward
+ (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)")
+ nil t)
+ (match-string 1)
+ "#")))
+ (set (make-local-variable 'comment-end) "")
+
+ ;; If comment-start ends in an alphanumeric then \b is used to match it
+ ;; only as a separate word. The test is for an alphanumeric rather than
+ ;; \w since we might try # or ! as \w characters but without wanting \b.
+ (let ((comment-regexp
+ (concat (regexp-quote comment-start)
+ (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b"))))
+
+ ;; Whitespace is required before a comment-start so m4 $# doesn't match
+ ;; when comment-start is "#".
+ ;; Only spaces or tabs match after, so newline isn't included in the
+ ;; font lock below.
+ (set (make-local-variable 'comment-start-skip)
+ (concat "\\(^\\|\\s-\\)" comment-regexp "[ \t]*"))
+
+ ;; Comment fontification based on comment-start, matching through to the
+ ;; end of the line.
+ (add-to-list (make-local-variable 'gmpasm-font-lock-keywords)
+ (cons (concat
+ "\\(\\bdnl\\b\\|" comment-start-skip "\\).*$")
+ 'font-lock-comment-face))
+
+ (set (make-local-variable 'font-lock-defaults)
+ '(gmpasm-font-lock-keywords
+ t ; no syntactic fontification (of strings etc)
+ nil ; no case-fold
+ ((?_ . "w")) ; _ part of a word while fontifying
+ ))
+
+ ;; Paragraphs are separated by blank lines, or lines with only dnl or
+ ;; comment-start.
+ (set (make-local-variable 'paragraph-separate)
+ (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$"))
+ (set (make-local-variable 'paragraph-start)
+ (concat "\f\\|" paragraph-separate))
+
+ ;; Adaptive fill gets dnl and comment-start as comment style prefixes on
+ ;; top of the standard regexp (which has # and ; already actually).
+ (set (make-local-variable 'adaptive-fill-regexp)
+ (concat "[ \t]*\\(\\("
+ comment-regexp
+ "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*"))
+ (set (make-local-variable 'adaptive-fill-first-line-regexp)
+ "\\`\\([ \t]*dnl\\)?[ \t]*\\'")
+
+ (when (fboundp 'filladapt-mode)
+ (when (not gmpasm-filladapt-token-table)
+ (setq gmpasm-filladapt-token-table
+ filladapt-token-table)
+ (setq gmpasm-filladapt-token-match-table
+ filladapt-token-match-table)
+ (setq gmpasm-filladapt-token-conversion-table
+ filladapt-token-conversion-table)
+
+ ;; Numbered bullet points like "2.1" get matched at the start of a
+ ;; line when it's really something like "2.1 cycles/limb", so delete
+ ;; this from the list. The regexp for "1.", "2." etc is left
+ ;; though.
+ (gmpasm-delete-from-list 'gmpasm-filladapt-token-table
+ '("[0-9]+\\(\\.[0-9]+\\)+[ \t]"
+ bullet))
+
+ ;; "%" as a comment prefix interferes with x86 register names
+ ;; like %eax, so delete this.
+ (gmpasm-delete-from-list 'gmpasm-filladapt-token-table
+ '("%+" postscript-comment))
+
+ (add-to-list 'gmpasm-filladapt-token-match-table
+ '(gmpasm-comment gmpasm-comment))
+ (add-to-list 'gmpasm-filladapt-token-conversion-table
+ '(gmpasm-comment . exact))
+ )
+
+ (set (make-local-variable 'filladapt-token-table)
+ gmpasm-filladapt-token-table)
+ (set (make-local-variable 'filladapt-token-match-table)
+ gmpasm-filladapt-token-match-table)
+ (set (make-local-variable 'filladapt-token-conversion-table)
+ gmpasm-filladapt-token-conversion-table)
+
+ ;; Add dnl and comment-start as fill prefixes.
+ ;; Comments in filladapt.el say filladapt-token-table must begin
+ ;; with ("^" beginning-of-line), so put our addition second.
+ (gmpasm-add-to-list-second 'filladapt-token-table
+ (list (concat "dnl[ \t]\\|" comment-regexp)
+ 'gmpasm-comment))
+ ))
+
+ (run-hooks 'gmpasm-mode-hook))
+
+
+(defun gmpasm-comment-region-dnl (beg end &optional arg)
+ "(gmpasm-comment-region-dnl BEG END &optional ARG)
+
+Comment or uncomment each line in the region using `dnl'.
+With \\[universal-argument] prefix arg, uncomment each line in region.
+This is `comment-region', but using \"dnl\"."
+
+ (interactive "r\nP")
+ (let ((comment-start "dnl")
+ (comment-end ""))
+ (comment-region beg end arg)))
+
+
+(provide 'gmpasm-mode)
+
+;;; gmpasm-mode.el ends here
diff --git a/rts/gmp/mpn/m68k/add_n.S b/rts/gmp/mpn/m68k/add_n.S
new file mode 100644
index 0000000000..9e1d89d64f
--- /dev/null
+++ b/rts/gmp/mpn/m68k/add_n.S
@@ -0,0 +1,79 @@
+/* mc68020 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+ sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s2_ptr (sp + 12)
+ size (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_add_n)
+
+C_SYMBOL_NAME(__gmpn_add_n:)
+PROLOG(__gmpn_add_n)
+/* Save used registers on the stack. */
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,12),R(a2)
+ movel MEM_DISP(sp,16),R(a0)
+ movel MEM_DISP(sp,20),R(a1)
+ movel MEM_DISP(sp,24),R(d2)
+
+ eorw #1,R(d2)
+ lsrl #1,R(d2)
+ bcc L(L1)
+ subql #1,R(d2) /* clears cy as side effect */
+
+L(Loop:)
+ movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ addxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+L(L1:) movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ addxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+
+ dbf R(d2),L(Loop) /* loop until 16 lsb of d2 == -1 */
+ subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */
+ subl #0x10000,R(d2)
+ bcs L(L2)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(Loop)
+
+L(L2:)
+ negl R(d0)
+
+/* Restore used registers from stack frame. */
+ movel MEM_POSTINC(sp),R(a2)
+ movel MEM_POSTINC(sp),R(d2)
+
+ rts
+EPILOG(__gmpn_add_n)
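
For reference, a minimal C sketch of what this routine implements (the
function name and the 32-bit limb type are assumptions for illustration; the
portable version lives in mpn/generic/add_n.c).  The assembler gets the same
effect with addxl in a dbf loop unrolled by two; the eorw/lsrl prologue just
selects the loop entry point for odd sizes.

    #include <stddef.h>

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs, as on m68k */

    /* Add {s1, size} and {s2, size}, store the sum at res, and return
       the carry-out of the most significant limb (0 or 1). */
    mp_limb_t
    sketch_add_n (mp_limb_t *res, const mp_limb_t *s1,
                  const mp_limb_t *s2, size_t size)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < size; i++)
        {
          mp_limb_t a = s1[i];
          mp_limb_t s = a + s2[i];
          mp_limb_t c1 = s < a;      /* carry out of the add */
          mp_limb_t t = s + cy;
          mp_limb_t c2 = t < s;      /* carry out of adding the carry-in */
          res[i] = t;
          cy = c1 | c2;              /* at most one of c1, c2 is set */
        }
      return cy;
    }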
diff --git a/rts/gmp/mpn/m68k/lshift.S b/rts/gmp/mpn/m68k/lshift.S
new file mode 100644
index 0000000000..a539d5d42e
--- /dev/null
+++ b/rts/gmp/mpn/m68k/lshift.S
@@ -0,0 +1,150 @@
+/* mc68020 __gmpn_lshift -- Shift left a low-level natural-number integer.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s_ptr (sp + 8)
+ s_size (sp + 12)
+ cnt (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_lshift)
+
+C_SYMBOL_NAME(__gmpn_lshift:)
+PROLOG(__gmpn_lshift)
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. */
+ movel MEM_DISP(sp,28),R(res_ptr)
+ movel MEM_DISP(sp,32),R(s_ptr)
+ movel MEM_DISP(sp,36),R(s_size)
+ movel MEM_DISP(sp,40),R(cnt)
+
+ moveql #1,R(d5)
+ cmpl R(d5),R(cnt)
+ bne L(Lnormal)
+ cmpl R(s_ptr),R(res_ptr)
+ bls L(Lspecial) /* jump if s_ptr >= res_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ lea MEM_INDX(s_ptr,d0,l),R(a2)
+#endif
+ cmpl R(res_ptr),R(a2)
+ bls L(Lspecial) /* jump if res_ptr >= s_ptr + s_size */
+
+L(Lnormal:)
+ moveql #32,R(d5)
+ subl R(cnt),R(d5)
+
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ addl R(d0),R(s_ptr)
+ addl R(d0),R(res_ptr)
+#endif
+ movel MEM_PREDEC(s_ptr),R(d2)
+ movel R(d2),R(d0)
+ lsrl R(d5),R(d0) /* compute carry limb */
+
+ lsll R(cnt),R(d2)
+ movel R(d2),R(d1)
+ subql #1,R(s_size)
+ beq L(Lend)
+ lsrl #1,R(s_size)
+ bcs L(L1)
+ subql #1,R(s_size)
+
+L(Loop:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ movel R(d2),R(d3)
+ lsrl R(d5),R(d3)
+ orl R(d3),R(d1)
+ movel R(d1),MEM_PREDEC(res_ptr)
+ lsll R(cnt),R(d2)
+L(L1:)
+ movel MEM_PREDEC(s_ptr),R(d1)
+ movel R(d1),R(d3)
+ lsrl R(d5),R(d3)
+ orl R(d3),R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+ lsll R(cnt),R(d1)
+
+ dbf R(s_size),L(Loop)
+ subl #0x10000,R(s_size)
+ bcc L(Loop)
+
+L(Lend:)
+ movel R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+
+/* We loop from the least significant end of the arrays here, which is
+   only permissible when the destination doesn't overlap the source from
+   above; the dispatch code above checks for that before branching here.
+   (The function is documented to work for overlapping source and
+   destination, which the normal path handles by looping from the most
+   significant end.) */
+
+L(Lspecial:)
+ clrl R(d0) /* initialize carry */
+ eorw #1,R(s_size)
+ lsrl #1,R(s_size)
+ bcc L(LL1)
+ subql #1,R(s_size)
+
+L(LLoop:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ addxl R(d2),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+L(LL1:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ addxl R(d2),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+
+ dbf R(s_size),L(LLoop)
+ addxl R(d0),R(d0) /* save cy in lsb */
+ subl #0x10000,R(s_size)
+ bcs L(LLend)
+ lsrl #1,R(d0) /* restore cy */
+ bra L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+EPILOG(__gmpn_lshift)
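
A C sketch of the normal path above (names and the 32-bit limb width are
assumptions; compare mpn/generic/lshift.c): shift {sp, n} left by cnt bits,
walking down from the most significant end so that rp >= sp overlap is safe,
and return the bits shifted out at the top.

    #include <stddef.h>

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    /* Requires 0 < cnt < 32 and n > 0. */
    mp_limb_t
    sketch_lshift (mp_limb_t *rp, const mp_limb_t *sp, size_t n,
                   unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
      mp_limb_t high = sp[n - 1];
      mp_limb_t retval = high >> tnc;   /* bits shifted out at the top */
      mp_limb_t low = high << cnt;
      for (size_t i = n - 1; i > 0; i--)
        {
          high = sp[i - 1];
          rp[i] = low | (high >> tnc);  /* splice adjacent source limbs */
          low = high << cnt;
        }
      rp[0] = low;
      return retval;
    }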
diff --git a/rts/gmp/mpn/m68k/mc68020/addmul_1.S b/rts/gmp/mpn/m68k/mc68020/addmul_1.S
new file mode 100644
index 0000000000..6638115d71
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/addmul_1.S
@@ -0,0 +1,83 @@
+/* mc68020 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+ the result to a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_addmul_1)
+
+C_SYMBOL_NAME(__gmpn_addmul_1:)
+PROLOG(__gmpn_addmul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d5),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,20),R(res_ptr)
+ movel MEM_DISP(sp,24),R(s1_ptr)
+ movel MEM_DISP(sp,28),R(s1_size)
+ movel MEM_DISP(sp,32),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ clrl R(d5)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ addxl R(d5),R(d1)
+ addl R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ addxl R(d5),R(d0)
+ addl R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ addxl R(d5),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d5)
+
+ rts
+EPILOG(__gmpn_addmul_1)
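
The mulul instruction above delivers a full 32x32->64 product into a register
pair.  In portable C the per-limb update looks like this (a sketch; names and
limb width are assumptions, compare mpn/generic/addmul_1.c):

    #include <stddef.h>

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    /* res[0..n-1] += s1[0..n-1] * limb; return the high carry limb. */
    mp_limb_t
    sketch_addmul_1 (mp_limb_t *res, const mp_limb_t *s1, size_t n,
                     mp_limb_t limb)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          /* product + carry + old residue fits exactly in 64 bits:
             (2^32-1)^2 + 2*(2^32-1) == 2^64 - 1 */
          unsigned long long p = (unsigned long long) s1[i] * limb;
          p += cy;
          p += res[i];
          res[i] = (mp_limb_t) p;      /* low limb back to memory */
          cy = (mp_limb_t) (p >> 32);  /* high limb becomes the carry */
        }
      return cy;
    }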
diff --git a/rts/gmp/mpn/m68k/mc68020/mul_1.S b/rts/gmp/mpn/m68k/mc68020/mul_1.S
new file mode 100644
index 0000000000..fdd4c39d70
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/mul_1.S
@@ -0,0 +1,90 @@
+/* mc68020 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+ the result in a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_mul_1)
+
+C_SYMBOL_NAME(__gmpn_mul_1:)
+PROLOG(__gmpn_mul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d4),MEM_PREDEC(sp)
+#if 0
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(d3),MEM_PREDEC(sp)
+ movel R(d4),MEM_PREDEC(sp)
+#endif
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,16),R(res_ptr)
+ movel MEM_DISP(sp,20),R(s1_ptr)
+ movel MEM_DISP(sp,24),R(s1_size)
+ movel MEM_DISP(sp,28),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ movel R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ movel R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ clrl R(d3)
+ addxl R(d3),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d4)
+#if 0
+ movel MEM_POSTINC(sp),R(d4)
+ movel MEM_POSTINC(sp),R(d3)
+ movel MEM_POSTINC(sp),R(d2)
+#endif
+ rts
+EPILOG(__gmpn_mul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/submul_1.S b/rts/gmp/mpn/m68k/mc68020/submul_1.S
new file mode 100644
index 0000000000..3c36b70166
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/submul_1.S
@@ -0,0 +1,83 @@
+/* mc68020 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_submul_1)
+
+C_SYMBOL_NAME(__gmpn_submul_1:)
+PROLOG(__gmpn_submul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d5),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,20),R(res_ptr)
+ movel MEM_DISP(sp,24),R(s1_ptr)
+ movel MEM_DISP(sp,28),R(s1_size)
+ movel MEM_DISP(sp,32),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ clrl R(d5)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ addxl R(d5),R(d1)
+ subl R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ addxl R(d5),R(d0)
+ subl R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ addxl R(d5),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d5)
+
+ rts
+EPILOG(__gmpn_submul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/udiv.S b/rts/gmp/mpn/m68k/mc68020/udiv.S
new file mode 100644
index 0000000000..d00cf13558
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/udiv.S
@@ -0,0 +1,31 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+.text
+ .even
+.globl ___udiv_qrnnd
+___udiv_qrnnd:
+ movel sp@(4),a0
+ movel sp@(8),d1
+ movel sp@(12),d0
+ divul sp@(16),d1:d0
+ movel d1,a0@
+ rts
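
Reading the register moves off: a0 gets the remainder pointer, d1:d0 the
64-bit dividend (high:low), and divul leaves the quotient in d0 and the
remainder in d1.  As a C sketch (the name is an assumption; GMP's udiv_qrnnd
requires n1 < d so the quotient fits in one limb):

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    /* Divide n1*2^32 + n0 by d; store the remainder through rem_ptr
       and return the quotient.  Caller guarantees n1 < d. */
    mp_limb_t
    sketch_udiv_qrnnd (mp_limb_t *rem_ptr, mp_limb_t n1, mp_limb_t n0,
                       mp_limb_t d)
    {
      unsigned long long n = ((unsigned long long) n1 << 32) | n0;
      *rem_ptr = (mp_limb_t) (n % d);
      return (mp_limb_t) (n / d);
    }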
diff --git a/rts/gmp/mpn/m68k/mc68020/umul.S b/rts/gmp/mpn/m68k/mc68020/umul.S
new file mode 100644
index 0000000000..a34ae6c543
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/umul.S
@@ -0,0 +1,31 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+.text
+ .even
+.globl ___umul_ppmm
+___umul_ppmm:
+ movel sp@(4),a0
+ movel sp@(8),d1
+ movel sp@(12),d0
+ mulul d0,d0:d1
+ movel d1,a0@
+ rts
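
Likewise, mulul d0,d0:d1 forms the full 64-bit product of the two limb
arguments; the routine returns the high half in d0 and stores the low half
through the pointer argument.  A C sketch (name is an assumption):

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    /* Return the high limb of a*b and store the low limb at *lo. */
    mp_limb_t
    sketch_umul_ppmm (mp_limb_t *lo, mp_limb_t a, mp_limb_t b)
    {
      unsigned long long p = (unsigned long long) a * b;
      *lo = (mp_limb_t) p;
      return (mp_limb_t) (p >> 32);
    }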
diff --git a/rts/gmp/mpn/m68k/rshift.S b/rts/gmp/mpn/m68k/rshift.S
new file mode 100644
index 0000000000..b47a48e52a
--- /dev/null
+++ b/rts/gmp/mpn/m68k/rshift.S
@@ -0,0 +1,149 @@
+/* mc68020 __gmpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s_ptr (sp + 8)
+ s_size (sp + 12)
+ cnt (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_rshift)
+
+C_SYMBOL_NAME(__gmpn_rshift:)
+PROLOG(__gmpn_rshift)
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. */
+ movel MEM_DISP(sp,28),R(res_ptr)
+ movel MEM_DISP(sp,32),R(s_ptr)
+ movel MEM_DISP(sp,36),R(s_size)
+ movel MEM_DISP(sp,40),R(cnt)
+
+ moveql #1,R(d5)
+ cmpl R(d5),R(cnt)
+ bne L(Lnormal)
+ cmpl R(res_ptr),R(s_ptr)
+ bls L(Lspecial) /* jump if res_ptr >= s_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ lea MEM_INDX(res_ptr,d0,l),R(a2)
+#endif
+ cmpl R(s_ptr),R(a2)
+ bls L(Lspecial) /* jump if s_ptr >= res_ptr + s_size */
+
+L(Lnormal:)
+ moveql #32,R(d5)
+ subl R(cnt),R(d5)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ movel R(d2),R(d0)
+ lsll R(d5),R(d0) /* compute carry limb */
+
+ lsrl R(cnt),R(d2)
+ movel R(d2),R(d1)
+ subql #1,R(s_size)
+ beq L(Lend)
+ lsrl #1,R(s_size)
+ bcs L(L1)
+ subql #1,R(s_size)
+
+L(Loop:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ movel R(d2),R(d3)
+ lsll R(d5),R(d3)
+ orl R(d3),R(d1)
+ movel R(d1),MEM_POSTINC(res_ptr)
+ lsrl R(cnt),R(d2)
+L(L1:)
+ movel MEM_POSTINC(s_ptr),R(d1)
+ movel R(d1),R(d3)
+ lsll R(d5),R(d3)
+ orl R(d3),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+ lsrl R(cnt),R(d1)
+
+ dbf R(s_size),L(Loop)
+ subl #0x10000,R(s_size)
+ bcc L(Loop)
+
+L(Lend:)
+ movel R(d1),MEM(res_ptr) /* store most significant limb */
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+
+/* We loop from the most significant end of the arrays here, which is
+   only permissible when the destination doesn't overlap the source from
+   below; the dispatch code above checks for that before branching here.
+   (The function is documented to work for overlapping source and
+   destination, which the normal path handles by looping from the least
+   significant end.) */
+
+L(Lspecial:)
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ addl R(d0),R(s_ptr)
+ addl R(d0),R(res_ptr)
+#endif
+
+ clrl R(d0) /* initialize carry */
+ eorw #1,R(s_size)
+ lsrl #1,R(s_size)
+ bcc L(LL1)
+ subql #1,R(s_size)
+
+L(LLoop:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ roxrl #1,R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+L(LL1:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ roxrl #1,R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+
+ dbf R(s_size),L(LLoop)
+ roxrl #1,R(d0) /* save cy in msb */
+ subl #0x10000,R(s_size)
+ bcs L(LLend)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+EPILOG(__gmpn_rshift)
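
The complementary C sketch for the normal path of rshift (assumptions as
before; compare mpn/generic/rshift.c): walk up from the least significant end
so that rp <= sp overlap is safe, returning the bits shifted out at the
bottom.

    #include <stddef.h>

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    /* Requires 0 < cnt < 32 and n > 0. */
    mp_limb_t
    sketch_rshift (mp_limb_t *rp, const mp_limb_t *sp, size_t n,
                   unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
      mp_limb_t high = sp[0];
      mp_limb_t retval = high << tnc;   /* bits shifted out at the bottom */
      mp_limb_t low = high >> cnt;
      for (size_t i = 1; i < n; i++)
        {
          high = sp[i];
          rp[i - 1] = low | (high << tnc);
          low = high >> cnt;
        }
      rp[n - 1] = low;
      return retval;
    }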
diff --git a/rts/gmp/mpn/m68k/sub_n.S b/rts/gmp/mpn/m68k/sub_n.S
new file mode 100644
index 0000000000..ce45b24db5
--- /dev/null
+++ b/rts/gmp/mpn/m68k/sub_n.S
@@ -0,0 +1,79 @@
+/* mc68020 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ store difference in a third limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s2_ptr (sp + 12)
+ size (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_sub_n)
+
+C_SYMBOL_NAME(__gmpn_sub_n:)
+PROLOG(__gmpn_sub_n)
+/* Save used registers on the stack. */
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,12),R(a2)
+ movel MEM_DISP(sp,16),R(a0)
+ movel MEM_DISP(sp,20),R(a1)
+ movel MEM_DISP(sp,24),R(d2)
+
+ eorw #1,R(d2)
+ lsrl #1,R(d2)
+ bcc L(L1)
+ subql #1,R(d2) /* clears cy as side effect */
+
+L(Loop:)
+ movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ subxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+L(L1:) movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ subxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+
+ dbf R(d2),L(Loop) /* loop until 16 lsb of d2 == -1 */
+ subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */
+ subl #0x10000,R(d2)
+ bcs L(L2)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(Loop)
+
+L(L2:)
+ negl R(d0)
+
+/* Restore used registers from stack frame. */
+ movel MEM_POSTINC(sp),R(a2)
+ movel MEM_POSTINC(sp),R(d2)
+
+ rts
+EPILOG(__gmpn_sub_n)
diff --git a/rts/gmp/mpn/m68k/syntax.h b/rts/gmp/mpn/m68k/syntax.h
new file mode 100644
index 0000000000..9eec279c06
--- /dev/null
+++ b/rts/gmp/mpn/m68k/syntax.h
@@ -0,0 +1,177 @@
+/* asm.h -- Definitions for 68k syntax variations.
+
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#undef ALIGN
+
+#ifdef MIT_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)base@
+#define MEM_DISP(base,displacement)base@(displacement)
+#define MEM_INDX(base,idx,size_suffix)base@(idx:size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)base@(idx:size_suffix:scale)
+#define MEM_PREDEC(memory_base)memory_base@-
+#define MEM_POSTINC(memory_base)memory_base@+
+#define L(label) label
+#define TEXT .text
+#define ALIGN .even
+#define GLOBL .globl
+#define moveql moveq
+/* Use variable sized opcodes. */
+#define bcc jcc
+#define bcs jcs
+#define bls jls
+#define beq jeq
+#define bne jne
+#define bra jra
+#endif
+
+#ifdef SONY_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)(base)
+#define MEM_DISP(base,displacement)(displacement,base)
+#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale)
+#define MEM_PREDEC(memory_base)-(memory_base)
+#define MEM_POSTINC(memory_base)(memory_base)+
+#define L(label) label
+#define TEXT .text
+#define ALIGN .even
+#define GLOBL .globl
+#endif
+
+#ifdef MOTOROLA_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)(base)
+#define MEM_DISP(base,displacement)(displacement,base)
+#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale)
+#define MEM_PREDEC(memory_base)-(memory_base)
+#define MEM_POSTINC(memory_base)(memory_base)+
+#define L(label) label
+#define TEXT
+#define ALIGN
+#define GLOBL XDEF
+#define lea LEA
+#define movel MOVE.L
+#define moveml MOVEM.L
+#define moveql MOVEQ.L
+#define cmpl CMP.L
+#define orl OR.L
+#define clrl CLR.L
+#define eorw EOR.W
+#define lsrl LSR.L
+#define lsll LSL.L
+#define roxrl ROXR.L
+#define roxll ROXL.L
+#define addl ADD.L
+#define addxl ADDX.L
+#define addql ADDQ.L
+#define subl SUB.L
+#define subxl SUBX.L
+#define subql SUBQ.L
+#define negl NEG.L
+#define mulul MULU.L
+#define bcc BCC
+#define bcs BCS
+#define bls BLS
+#define beq BEQ
+#define bne BNE
+#define bra BRA
+#define dbf DBF
+#define rts RTS
+#define d0 D0
+#define d1 D1
+#define d2 D2
+#define d3 D3
+#define d4 D4
+#define d5 D5
+#define d6 D6
+#define d7 D7
+#define a0 A0
+#define a1 A1
+#define a2 A2
+#define a3 A3
+#define a4 A4
+#define a5 A5
+#define a6 A6
+#define a7 A7
+#define sp SP
+#endif
+
+#ifdef ELF_SYNTAX
+#define PROLOG(name) .type name,@function
+#define EPILOG(name) .size name,.-name
+#define MEM(base)(R(base))
+#define MEM_DISP(base,displacement)(displacement,R(base))
+#define MEM_PREDEC(memory_base)-(R(memory_base))
+#define MEM_POSTINC(memory_base)(R(memory_base))+
+#ifdef __STDC__
+#define R_(r)%##r
+#define R(r)R_(r)
+#define MEM_INDX_(base,idx,size_suffix)(R(base),R(idx##.##size_suffix))
+#define MEM_INDX(base,idx,size_suffix)MEM_INDX_(base,idx,size_suffix)
+#define MEM_INDX1_(base,idx,size_suffix,scale)(R(base),R(idx##.##size_suffix*scale))
+#define MEM_INDX1(base,idx,size_suffix,scale)MEM_INDX1_(base,idx,size_suffix,scale)
+#define L(label) .##label
+#else
+#define R(r)%/**/r
+#define MEM_INDX(base,idx,size_suffix)(R(base),R(idx).size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(R(base),R(idx).size_suffix*scale)
+#define L(label) ./**/label
+#endif
+#define TEXT .text
+#define ALIGN .align 2
+#define GLOBL .globl
+#define bcc jbcc
+#define bcs jbcs
+#define bls jbls
+#define beq jbeq
+#define bne jbne
+#define bra jbra
+#endif
+
+#if defined (SONY_SYNTAX) || defined (ELF_SYNTAX)
+#define movel move.l
+#define moveml movem.l
+#define moveql moveq.l
+#define cmpl cmp.l
+#define orl or.l
+#define clrl clr.l
+#define eorw eor.w
+#define lsrl lsr.l
+#define lsll lsl.l
+#define roxrl roxr.l
+#define roxll roxl.l
+#define addl add.l
+#define addxl addx.l
+#define addql addq.l
+#define subl sub.l
+#define subxl subx.l
+#define subql subq.l
+#define negl neg.l
+#define mulul mulu.l
+#endif
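
The two-level macro pairs above (R_/R, MEM_INDX_/MEM_INDX) exist so that the
arguments are macro-expanded before ## pastes them; the pasted "%" only
becomes meaningful once the preprocessed text reaches the assembler.  The
same indirection pattern in plain, compilable C (hypothetical names, purely
for illustration):

    #include <stdio.h>

    #define PASTE_(a,b) a##b
    #define PASTE(a,b) PASTE_(a,b)  /* indirection: expand, then paste */

    #define REG d0                  /* stand-in for a register name */

    int
    main (void)
    {
      int PASTE (reg_, REG) = 42;   /* declares reg_d0; without the extra
                                       level it would declare reg_REG */
      printf ("%d\n", reg_d0);
      return 0;
    }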
diff --git a/rts/gmp/mpn/m88k/add_n.s b/rts/gmp/mpn/m88k/add_n.s
new file mode 100644
index 0000000000..0b776c618a
--- /dev/null
+++ b/rts/gmp/mpn/m88k/add_n.s
@@ -0,0 +1,104 @@
+; mc88100 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; s2_ptr r4
+; size r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention. As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+ text
+ align 16
+ global ___gmpn_add_n
+___gmpn_add_n:
+ ld r6,r3,0 ; read first limb from s1_ptr
+ extu r10,r5,3
+ ld r7,r4,0 ; read first limb from s2_ptr
+
+ subu.co r5,r0,r5 ; (clear carry as side effect)
+ mak r5,r5,3<4>
+ bcnd eq0,r5,Lzero
+
+ or r12,r0,lo16(Lbase)
+ or.u r12,r12,hi16(Lbase)
+ addu r12,r12,r5 ; r12 is address for entering in loop
+
+ extu r5,r5,2 ; divide by 4
+ subu r2,r2,r5 ; adjust res_ptr
+ subu r3,r3,r5 ; adjust s1_ptr
+ subu r4,r4,r5 ; adjust s2_ptr
+
+ or r8,r6,r0
+
+ jmp.n r12
+ or r9,r7,r0
+
+Loop: addu r3,r3,32
+ st r8,r2,28
+ addu r4,r4,32
+ ld r6,r3,0
+ addu r2,r2,32
+ ld r7,r4,0
+Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt)
+Lbase: ld r8,r3,4
+ addu.cio r6,r6,r7
+ ld r9,r4,4
+ st r6,r2,0
+ ld r6,r3,8 ; add 7 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,8
+ st r8,r2,4
+ ld r8,r3,12 ; add 6 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,12
+ st r6,r2,8
+ ld r6,r3,16 ; add 5 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,16
+ st r8,r2,12
+ ld r8,r3,20 ; add 4 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,20
+ st r6,r2,16
+ ld r6,r3,24 ; add 3 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,24
+ st r8,r2,20
+ ld r8,r3,28 ; add 2 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,28
+ st r6,r2,24
+ bcnd.n ne0,r10,Loop ; add 1 + 8r limbs
+ addu.cio r8,r8,r9
+
+ st r8,r2,28 ; store most significant limb
+
+ jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
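
The or/addu/jmp.n r12 sequence above enters the 8-way unrolled loop at a
computed position, so the limb count need not be a multiple of 8 -- the same
idea as Duff's device in C.  A sketch of the control structure only (names
are assumptions; the real routine additionally pipelines loads across
iterations to hit one instruction per clock):

    #include <stddef.h>

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    mp_limb_t
    sketch_add_n_unrolled (mp_limb_t *rp, const mp_limb_t *s1,
                           const mp_limb_t *s2, size_t n)
    {
      mp_limb_t cy = 0;
      size_t i = 0;

    /* One unrolled step: add with carry via a 64-bit intermediate. */
    #define STEP                                                          \
      do {                                                                \
        unsigned long long t = (unsigned long long) s1[i] + s2[i] + cy;   \
        rp[i] = (mp_limb_t) t;                                            \
        cy = (mp_limb_t) (t >> 32);                                       \
        i++;                                                              \
      } while (0)

      switch (n % 8)               /* computed entry, like jmp.n r12 */
        {
        case 0:
          while (i < n)
            {
              STEP;
        case 7: STEP;
        case 6: STEP;
        case 5: STEP;
        case 4: STEP;
        case 3: STEP;
        case 2: STEP;
        case 1: STEP;
            }
        }
    #undef STEP
      return cy;
    }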
diff --git a/rts/gmp/mpn/m88k/mc88110/add_n.S b/rts/gmp/mpn/m88k/mc88110/add_n.S
new file mode 100644
index 0000000000..843a50dded
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/add_n.S
@@ -0,0 +1,200 @@
+; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr r2
+#define s1_ptr r3
+#define s2_ptr r4
+#define size r5
+
+#include "sysdep.h"
+
+ text
+ align 16
+ global C_SYMBOL_NAME(__gmpn_add_n)
+C_SYMBOL_NAME(__gmpn_add_n):
+ addu.co r0,r0,r0 ; clear cy flag
+ xor r12,s2_ptr,res_ptr
+ bb1 2,r12,L1
+; ** V1a **
+L0:	bb0	2,res_ptr,L_v1	; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ addu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s1_ptr,0
+ ld r12,s1_ptr,4
+ ld.d r8,s2_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1: subu size,size,8
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,16
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,20
+ ld.d r8,s2_ptr,16
+ st.d r6,res_ptr,8
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,24
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,28
+ ld.d r8,s2_ptr,24
+ st.d r6,res_ptr,16
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,32
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,36
+ addu s1_ptr,s1_ptr,32
+ ld.d r8,s2_ptr,32
+ addu s2_ptr,s2_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1
+
+Lfin1: addu size,size,8-2
+ bcnd lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1: addu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1
+Lend1: addu.cio r6,r10,r8
+ addu.cio r7,r12,r9
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1
+/* Add last limb */
+ ld r10,s1_ptr,8
+ ld r8,s2_ptr,8
+ addu.cio r6,r10,r8
+ st r6,res_ptr,8
+
+Lret1: jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+
+L1: xor r12,s1_ptr,res_ptr
+ bb1 2,r12,L2
+; ** V1b **
+ or r12,r0,s2_ptr
+ or s2_ptr,r0,s1_ptr
+ or s1_ptr,r0,r12
+ br L0
+
+; ** V2 **
+/* If we come here, s1_ptr and res_ptr have different alignment, and so do
+   s2_ptr and res_ptr.  Since there are only two possible alignments (that
+   we care about), we now know that s1_ptr and s2_ptr have the same
+   alignment. */
+
+L2: cmp r12,size,1
+ bb1 eq,r12,Ljone
+ bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ addu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+
+L_v2: subu size,size,8
+ bcnd lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop2: subu size,size,8
+ ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ addu.cio r8,r8,r6
+ st r8,res_ptr,0
+ addu.cio r9,r9,r7
+ st r9,res_ptr,4
+ ld.d r8,s1_ptr,8
+ ld.d r6,s2_ptr,8
+ addu.cio r8,r8,r6
+ st r8,res_ptr,8
+ addu.cio r9,r9,r7
+ st r9,res_ptr,12
+ ld.d r8,s1_ptr,16
+ ld.d r6,s2_ptr,16
+ addu.cio r8,r8,r6
+ st r8,res_ptr,16
+ addu.cio r9,r9,r7
+ st r9,res_ptr,20
+ ld.d r8,s1_ptr,24
+ ld.d r6,s2_ptr,24
+ addu.cio r8,r8,r6
+ st r8,res_ptr,24
+ addu.cio r9,r9,r7
+ st r9,res_ptr,28
+ addu s1_ptr,s1_ptr,32
+ addu s2_ptr,s2_ptr,32
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop2
+
+Lfin2: addu size,size,8-2
+ bcnd lt0,size,Lend2
+Loope2: ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ addu.cio r8,r8,r6
+ st r8,res_ptr,0
+ addu.cio r9,r9,r7
+ st r9,res_ptr,4
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope2
+Lend2: bb0 0,size,Lret2
+/* Add last limb */
+Ljone: ld r10,s1_ptr,0
+ ld r8,s2_ptr,0
+ addu.cio r6,r10,r8
+ st r6,res_ptr,0
+
+Lret2: jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
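
The xor/bb1 2,... dispatch at the top tests bit 2 of the exclusive-or of two
pointers, i.e. whether they agree modulo 8; that decides where the ld.d/st.d
double-word forms can be used.  Spelled out as a hypothetical C helper:

    #include <stdint.h>

    /* Nonzero iff a and b have the same alignment modulo 8, so a pair
       of limbs at matching offsets can be moved with 64-bit accesses. */
    static int
    same_dword_phase (const void *a, const void *b)
    {
      return (((uintptr_t) a ^ (uintptr_t) b) & 4) == 0;
    }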
diff --git a/rts/gmp/mpn/m88k/mc88110/addmul_1.s b/rts/gmp/mpn/m88k/mc88110/addmul_1.s
new file mode 100644
index 0000000000..7d97c87c79
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/addmul_1.s
@@ -0,0 +1,61 @@
+; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; add the product to a second limb vector.
+
+; Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+ text
+ align 16
+ global ___gmpn_addmul_1
+___gmpn_addmul_1:
+ lda r3,r3[r4]
+ lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval
+ subu r4,r0,r4
+ addu.co r2,r0,r0 ; r2 = cy = 0
+
+ ld r6,r3[r4]
+ addu r4,r4,1
+ subu r8,r8,4
+ bcnd.n eq0,r4,Lend
+ mulu.d r10,r6,r5
+
+Loop: ld r7,r8[r4]
+ ld r6,r3[r4]
+ addu.cio r9,r11,r2
+ addu.ci r2,r10,r0
+ addu.co r9,r9,r7
+ st r9,r8[r4]
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd ne0,r4,Loop
+
+Lend: ld r7,r8,0
+ addu.cio r9,r11,r2
+ addu.ci r2,r10,r0
+ addu.co r9,r9,r7
+ st r9,r8,0
+ jmp.n r1
+ addu.ci r2,r2,r0
diff --git a/rts/gmp/mpn/m88k/mc88110/mul_1.s b/rts/gmp/mpn/m88k/mc88110/mul_1.s
new file mode 100644
index 0000000000..b8483afa91
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/mul_1.s
@@ -0,0 +1,59 @@
+; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+ text
+ align 16
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+ ; Make S1_PTR and RES_PTR point at the end of their blocks
+ ; and negate SIZE.
+ lda r3,r3[r4]
+ lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval
+ subu r4,r0,r4
+
+ addu.co r2,r0,r0 ; r2 = cy = 0
+
+ ld r6,r3[r4]
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd.n eq0,r4,Lend
+ subu r8,r8,8
+
+Loop: ld r6,r3[r4]
+ addu.cio r9,r11,r2
+ or r2,r10,r0 ; could be avoided if unrolled
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd.n ne0,r4,Loop
+ st r9,r8[r4]
+
+Lend: addu.cio r9,r11,r2
+ st r9,r8,4
+ jmp.n r1
+ addu.ci r2,r10,r0
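
The loop above maintains the invariant that each step computes
s1[i]*limb + carry, with mulu.d forming the 64-bit product in r10:r11.  In C
(a sketch, assumptions as before; compare mpn/generic/mul_1.c):

    #include <stddef.h>

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    /* rp[0..n-1] = s1[0..n-1] * limb; return the high carry limb. */
    mp_limb_t
    sketch_mul_1 (mp_limb_t *rp, const mp_limb_t *s1, size_t n,
                  mp_limb_t limb)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned long long p = (unsigned long long) s1[i] * limb + cy;
          rp[i] = (mp_limb_t) p;       /* low limb */
          cy = (mp_limb_t) (p >> 32);  /* high limb feeds the next step */
        }
      return cy;
    }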
diff --git a/rts/gmp/mpn/m88k/mc88110/sub_n.S b/rts/gmp/mpn/m88k/mc88110/sub_n.S
new file mode 100644
index 0000000000..715a3faf25
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/sub_n.S
@@ -0,0 +1,276 @@
+; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr r2
+#define s1_ptr r3
+#define s2_ptr r4
+#define size r5
+
+#include "sysdep.h"
+
+ text
+ align 16
+ global C_SYMBOL_NAME(__gmpn_sub_n)
+C_SYMBOL_NAME(__gmpn_sub_n):
+ subu.co r0,r0,r0 ; set cy flag
+ xor r12,s2_ptr,res_ptr
+ bb1 2,r12,L1
+; ** V1a **
+L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned
+/* Subtract least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ subu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s1_ptr,0
+ ld r12,s1_ptr,4
+ ld.d r8,s2_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1
+/* Subtract blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1: subu size,size,8
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,16
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,20
+ ld.d r8,s2_ptr,16
+ st.d r6,res_ptr,8
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,24
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,28
+ ld.d r8,s2_ptr,24
+ st.d r6,res_ptr,16
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,32
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,36
+ addu s1_ptr,s1_ptr,32
+ ld.d r8,s2_ptr,32
+ addu s2_ptr,s2_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1
+
+Lfin1: addu size,size,8-2
+ bcnd lt0,size,Lend1
+/* Subtract blocks of 2 limbs until less than 2 limbs remain */
+Loope1: subu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1
+Lend1: subu.cio r6,r10,r8
+ subu.cio r7,r12,r9
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1
+/* Subtract last limb */
+ ld r10,s1_ptr,8
+ ld r8,s2_ptr,8
+ subu.cio r6,r10,r8
+ st r6,res_ptr,8
+
+Lret1: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
+
+L1: xor r12,s1_ptr,res_ptr
+ bb1 2,r12,L2
+; ** V1b **
+ bb0 2,res_ptr,L_v1b ; branch if res_ptr is aligned
+/* Subtract least significant limb separately to align res_ptr and s1_ptr */
+ ld r10,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ ld r8,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ subu size,size,1
+ subu.co r6,r8,r10
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1b: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s2_ptr,0
+ ld r12,s2_ptr,4
+ ld.d r8,s1_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1b
+/* Subtract blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1b: subu size,size,8
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,8
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,12
+ ld.d r8,s1_ptr,8
+ st.d r6,res_ptr,0
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,16
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,20
+ ld.d r8,s1_ptr,16
+ st.d r6,res_ptr,8
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,24
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,28
+ ld.d r8,s1_ptr,24
+ st.d r6,res_ptr,16
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,32
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,36
+ addu s2_ptr,s2_ptr,32
+ ld.d r8,s1_ptr,32
+ addu s1_ptr,s1_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1b
+
+Lfin1b: addu size,size,8-2
+ bcnd lt0,size,Lend1b
+/* Subtract blocks of 2 limbs until less than 2 limbs remain */
+Loope1b:subu.cio r6,r8,r10
+ ld r10,s2_ptr,8
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,12
+ ld.d r8,s1_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1b
+Lend1b: subu.cio r6,r8,r10
+ subu.cio r7,r9,r12
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1b
+/* Subtract last limb */
+ ld r10,s2_ptr,8
+ ld r8,s1_ptr,8
+ subu.cio r6,r8,r10
+ st r6,res_ptr,8
+
+Lret1b: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
+
+; ** V2 **
+/* If we come here, s1_ptr and res_ptr have different alignment, and so do
+   s2_ptr and res_ptr.  Since there are only two possible alignments (that
+   we care about), we now know that s1_ptr and s2_ptr have the same
+   alignment. */
+
+L2: cmp r12,size,1
+ bb1 eq,r12,Ljone
+ bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned
+/* Subtract least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ subu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+
+L_v2: subu size,size,8
+ bcnd lt0,size,Lfin2
+/* Subtract blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop2: subu size,size,8
+ ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ subu.cio r8,r8,r6
+ st r8,res_ptr,0
+ subu.cio r9,r9,r7
+ st r9,res_ptr,4
+ ld.d r8,s1_ptr,8
+ ld.d r6,s2_ptr,8
+ subu.cio r8,r8,r6
+ st r8,res_ptr,8
+ subu.cio r9,r9,r7
+ st r9,res_ptr,12
+ ld.d r8,s1_ptr,16
+ ld.d r6,s2_ptr,16
+ subu.cio r8,r8,r6
+ st r8,res_ptr,16
+ subu.cio r9,r9,r7
+ st r9,res_ptr,20
+ ld.d r8,s1_ptr,24
+ ld.d r6,s2_ptr,24
+ subu.cio r8,r8,r6
+ st r8,res_ptr,24
+ subu.cio r9,r9,r7
+ st r9,res_ptr,28
+ addu s1_ptr,s1_ptr,32
+ addu s2_ptr,s2_ptr,32
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop2
+
+Lfin2: addu size,size,8-2
+ bcnd lt0,size,Lend2
+Loope2: ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ subu.cio r8,r8,r6
+ st r8,res_ptr,0
+ subu.cio r9,r9,r7
+ st r9,res_ptr,4
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope2
+Lend2: bb0 0,size,Lret2
+/* Subtract last limb */
+Ljone: ld r10,s1_ptr,0
+ ld r8,s2_ptr,0
+ subu.cio r6,r10,r8
+ st r6,res_ptr,0
+
+Lret2: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
diff --git a/rts/gmp/mpn/m88k/mul_1.s b/rts/gmp/mpn/m88k/mul_1.s
new file mode 100644
index 0000000000..06370837ef
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mul_1.s
@@ -0,0 +1,127 @@
+; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention. It might be tempting to move the
+; ld instruction in the loops down to save 2 cycles (less WB contention),
+;    but that loses because the ultimate value will be read from outside
+; the allocated space. But if we handle the ultimate multiplication in
+; the tail, we can do this.
+; 3. Do the multiplication with fewer instructions.  I think the code for
+; (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb. (Assuming infinite unrolling.)
+
+ text
+ align 16
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+
+ ; Make S1_PTR and RES_PTR point at the end of their blocks
+ ; and negate SIZE.
+ lda r3,r3[r4]
+ lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval
+ subu r4,r0,r4
+
+ addu.co r2,r0,r0 ; r2 = cy = 0
+ ld r9,r3[r4]
+ mask r7,r5,0xffff ; r7 = lo(S2_LIMB)
+ extu r8,r5,16 ; r8 = hi(S2_LIMB)
+ bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0)
+ subu r6,r6,4
+
+; General code for any value of S2_LIMB.
+
+ ; Make a stack frame and save r25 and r26
+ subu r31,r31,16
+ st.d r25,r31,8
+
+ ; Enter the loop in the middle
+ br.n L1
+ addu r4,r4,1
+
+Loop: ld r9,r3[r4]
+ st r26,r6[r4]
+; bcnd ne0,r0,0 ; bubble
+ addu r4,r4,1
+L1: mul r26,r9,r5 ; low word of product mul_1 WB ld
+ mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1
+ mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1
+ mul r10,r12,r8 ; r10 = prod_1a mul_3
+ extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1
+ mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1
+ mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2
+ extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3
+ addu r10,r10,r11 ; addu_1 WB extu_2
+; bcnd ne0,r0,0 ; bubble WB addu_1
+ addu.co r10,r10,r12 ; WB mul_4
+ mask.u r10,r10,0xffff ; move the 16 most significant bits...
+ addu.ci r10,r10,r0 ; ...to the low half of the word...
+ rot r10,r10,16 ; ...and put carry in pos 16.
+ addu.co r26,r26,r2 ; add old carry limb
+ bcnd.n ne0,r4,Loop
+ addu.ci r2,r25,r10 ; compute new carry limb
+
+ st r26,r6[r4]
+ ld.d r25,r31,8
+ jmp.n r1
+ addu r31,r31,16
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+ ; Enter the loop in the middle
+ br.n SL1
+ addu r4,r4,1
+
+SLoop: ld r9,r3[r4] ;
+ st r8,r6[r4] ;
+ addu r4,r4,1 ;
+SL1: mul r8,r9,r5 ; low word of product
+ mask r12,r9,0xffff ; r12 = lo(s1_limb)
+ extu r13,r9,16 ; r13 = hi(s1_limb)
+ mul r11,r12,r7 ; r11 = prod_0
+ mul r12,r13,r7 ; r12 = prod_1b
+ addu.cio r8,r8,r2 ; add old carry limb
+ extu r10,r11,16 ; r11 = hi(prod_0)
+ addu r10,r10,r12 ;
+ bcnd.n ne0,r4,SLoop
+ extu r2,r10,16 ; r2 = new carry limb
+
+ jmp.n r1
+ st r8,r6[r4]
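
The mc88100's mul yields only the low 32 bits of a product, so the code above
splits the operands into 16-bit halves with mask/extu and assembles the high
word from partial products (the Lsmall path works because two of the partials
vanish when hi(S2_LIMB) == 0).  The general scheme as a C sketch
(hypothetical name):

    typedef unsigned int mp_limb_t;  /* assuming 32-bit limbs */

    /* Build the 64-bit product of a and b from 16-bit partial products,
       returning the high word and storing the low word at *lo. */
    mp_limb_t
    sketch_umul_16 (mp_limb_t *lo, mp_limb_t a, mp_limb_t b)
    {
      mp_limb_t a0 = a & 0xffff, a1 = a >> 16;   /* mask / extu */
      mp_limb_t b0 = b & 0xffff, b1 = b >> 16;

      mp_limb_t p00 = a0 * b0;                   /* partial products */
      mp_limb_t p01 = a0 * b1;
      mp_limb_t p10 = a1 * b0;
      mp_limb_t p11 = a1 * b1;

      /* middle terms; mid never overflows 32 bits */
      mp_limb_t mid = (p00 >> 16) + (p01 & 0xffff) + (p10 & 0xffff);

      *lo = (p00 & 0xffff) | (mid << 16);
      return p11 + (p01 >> 16) + (p10 >> 16) + (mid >> 16);
    }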
diff --git a/rts/gmp/mpn/m88k/sub_n.s b/rts/gmp/mpn/m88k/sub_n.s
new file mode 100644
index 0000000000..2fd345a135
--- /dev/null
+++ b/rts/gmp/mpn/m88k/sub_n.s
@@ -0,0 +1,106 @@
+; mc88100 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; s2_ptr r4
+; size r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention. As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+ text
+ align 16
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ ld r6,r3,0 ; read first limb from s1_ptr
+ extu r10,r5,3
+ ld r7,r4,0 ; read first limb from s2_ptr
+
+ subu r5,r0,r5
+ mak r5,r5,3<4>
+ bcnd.n eq0,r5,Lzero
+ subu.co r0,r0,r0 ; initialize carry
+
+ or r12,r0,lo16(Lbase)
+ or.u r12,r12,hi16(Lbase)
+ addu r12,r12,r5 ; r12 is address for entering in loop
+
+ extu r5,r5,2 ; divide by 4
+ subu r2,r2,r5 ; adjust res_ptr
+ subu r3,r3,r5 ; adjust s1_ptr
+ subu r4,r4,r5 ; adjust s2_ptr
+
+ or r8,r6,r0
+
+ jmp.n r12
+ or r9,r7,r0
+
+Loop: addu r3,r3,32
+ st r8,r2,28
+ addu r4,r4,32
+ ld r6,r3,0
+ addu r2,r2,32
+ ld r7,r4,0
+Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt)
+Lbase: ld r8,r3,4
+ subu.cio r6,r6,r7
+ ld r9,r4,4
+ st r6,r2,0
+ ld r6,r3,8 ; subtract 7 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,8
+ st r8,r2,4
+ ld r8,r3,12 ; subtract 6 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,12
+ st r6,r2,8
+ ld r6,r3,16 ; subtract 5 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,16
+ st r8,r2,12
+ ld r8,r3,20 ; subtract 4 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,20
+ st r6,r2,16
+ ld r6,r3,24 ; subtract 3 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,24
+ st r8,r2,20
+ ld r8,r3,28 ; subtract 2 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,28
+ st r6,r2,24
+ bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs
+ subu.cio r8,r8,r9
+
+ st r8,r2,28 ; store most significant limb
+
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
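
The entry-point computation above (the mak/jmp.n r12 sequence) jumps into the
middle of the 8-way unrolled loop according to size mod 8, so no separate
cleanup loop is needed.  In C the same idea is the classic switch-into-the-loop
pattern; a sketch with illustrative names and 32-bit limbs (the .s file chains
the borrow through the carry flag with subu.cio instead of computing it
explicitly, and inverts it on return):

    #include <stdint.h>

    static uint32_t sub_n_sketch (uint32_t *rp, const uint32_t *ap,
                                  const uint32_t *bp, long n)
    {
      uint32_t cy = 0;
      long i = 0;
    #define STEP do { uint32_t b = bp[i] + cy;   \
                      uint32_t c1 = b < cy;      \
                      uint32_t r = ap[i] - b;    \
                      cy = c1 | (ap[i] < b);     \
                      rp[i] = r; i++; } while (0)
      switch (n & 7)            /* computed entry into the unrolled body */
        {
          do {
                  STEP;         /* 8 limbs per full trip */
          case 7: STEP;
          case 6: STEP;
          case 5: STEP;
          case 4: STEP;
          case 3: STEP;
          case 2: STEP;
          case 1: STEP;
          case 0: ;
          } while (i < n);
        }
    #undef STEP
      return cy;                /* borrow out */
    }
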
diff --git a/rts/gmp/mpn/mips2/add_n.s b/rts/gmp/mpn/mips2/add_n.s
new file mode 100644
index 0000000000..5c3c7fc8a1
--- /dev/null
+++ b/rts/gmp/mpn/mips2/add_n.s
@@ -0,0 +1,120 @@
+ # MIPS2 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_add_n
+ .ent __gmpn_add_n
+__gmpn_add_n:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5)
+ lw $11,0($6)
+
+ addiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ subu $7,$7,$9
+
+.Loop0: addiu $9,$9,-1
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ or $2,$2,$8
+
+ addiu $5,$5,4
+ addiu $6,$6,4
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ addiu $4,$4,4
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: addiu $7,$7,-4
+
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ or $2,$2,$8
+
+ lw $10,8($5)
+ addu $13,$13,$2
+ lw $11,8($6)
+ sltu $8,$13,$2
+ addu $13,$12,$13
+ sltu $2,$13,$12
+ sw $13,4($4)
+ or $2,$2,$8
+
+ lw $12,12($5)
+ addu $11,$11,$2
+ lw $13,12($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,8($4)
+ or $2,$2,$8
+
+ lw $10,16($5)
+ addu $13,$13,$2
+ lw $11,16($6)
+ sltu $8,$13,$2
+ addu $13,$12,$13
+ sltu $2,$13,$12
+ sw $13,12($4)
+ or $2,$2,$8
+
+ addiu $5,$5,16
+ addiu $6,$6,16
+
+ bne $7,$0,.Loop
+ addiu $4,$4,16
+
+.Lend: addu $11,$11,$2
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_add_n
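
MIPS has no carry flag, so the loop materializes the carry in a register with
sltu after each addition.  One limb step corresponds roughly to this C (a
sketch assuming 32-bit limbs; names are illustrative):

    #include <stdint.h>

    /* One limb of add_n: r = a + b + cy, recomputing cy with unsigned
       compares, mirroring the addu/sltu/or sequence in the loop. */
    static uint32_t add_n_step (uint32_t a, uint32_t b, uint32_t *cy)
    {
      uint32_t t  = b + *cy;    /* addu: add incoming carry        */
      uint32_t c1 = t < *cy;    /* sltu: carry out of that add     */
      uint32_t r  = a + t;      /* addu: add the limbs             */
      uint32_t c2 = r < a;      /* sltu: carry out of the limb add */
      *cy = c1 | c2;            /* or: at most one can be set      */
      return r;
    }
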
diff --git a/rts/gmp/mpn/mips2/addmul_1.s b/rts/gmp/mpn/mips2/addmul_1.s
new file mode 100644
index 0000000000..1e5037751b
--- /dev/null
+++ b/rts/gmp/mpn/mips2/addmul_1.s
@@ -0,0 +1,97 @@
+ # MIPS2 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_addmul_1
+ .ent __gmpn_addmul_1
+__gmpn_addmul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addiu $5,$5,4
+ addu $3,$3,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ multu $8,$7
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_addmul_1
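
The warm up/cool down structure software-pipelines the loop: each multu is
issued one iteration ahead, so the multiplier's latency is hidden behind the
carry bookkeeping for the previous product (the mul_1 and submul_1 files below
follow the same skeleton).  The per-limb arithmetic is roughly this C (a
sketch with illustrative names; 32-bit limbs, with uint64_t standing in for
the HI:LO register pair):

    #include <stdint.h>

    /* One limb of addmul_1: *rp += s1 * limb, with carry limb in/out. */
    static uint32_t addmul_1_step (uint32_t *rp, uint32_t s1, uint32_t limb,
                                   uint32_t cy)
    {
      uint64_t p   = (uint64_t) s1 * limb;  /* multu: HI:LO product   */
      uint32_t lo  = (uint32_t) p + cy;     /* add old carry limb     */
      uint32_t c   = lo < cy;               /* sltu                   */
      uint32_t old = *rp;
      uint32_t r   = old + lo;              /* add to res limb        */
      c += r < old;                         /* sltu                   */
      *rp = r;
      return (uint32_t) (p >> 32) + c;      /* high limb + carries    */
    }
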
diff --git a/rts/gmp/mpn/mips2/lshift.s b/rts/gmp/mpn/mips2/lshift.s
new file mode 100644
index 0000000000..2ca3a3c800
--- /dev/null
+++ b/rts/gmp/mpn/mips2/lshift.s
@@ -0,0 +1,95 @@
+ # MIPS2 __gmpn_lshift -- Shift a limb vector left by a given bit count.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_lshift
+ .ent __gmpn_lshift
+__gmpn_lshift:
+ .set noreorder
+ .set nomacro
+
+ sll $2,$6,2
+ addu $5,$5,$2 # make r5 point at end of src
+ lw $10,-4($5) # load first limb
+ subu $13,$0,$7
+ addu $4,$4,$2 # make r4 point at end of res
+ addiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ srl $2,$10,$13 # compute function result
+
+ subu $6,$6,$9
+
+.Loop0: lw $3,-8($5)
+ addiu $4,$4,-4
+ addiu $5,$5,-4
+ addiu $9,$9,-1
+ sll $11,$10,$7
+ srl $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sw $8,0($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: lw $3,-8($5)
+ addiu $4,$4,-16
+ addiu $6,$6,-4
+ sll $11,$10,$7
+ srl $12,$3,$13
+
+ lw $10,-12($5)
+ sll $14,$3,$7
+ or $8,$11,$12
+ sw $8,12($4)
+ srl $9,$10,$13
+
+ lw $3,-16($5)
+ sll $11,$10,$7
+ or $8,$14,$9
+ sw $8,8($4)
+ srl $12,$3,$13
+
+ lw $10,-20($5)
+ sll $14,$3,$7
+ or $8,$11,$12
+ sw $8,4($4)
+ srl $9,$10,$13
+
+ addiu $5,$5,-16
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sw $8,0($4)
+
+.Lend: sll $8,$10,$7
+ j $31
+ sw $8,-4($4)
+ .end __gmpn_lshift
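
lshift walks from the most significant limb downward: each output limb
combines the current limb shifted left by cnt with the next lower limb
shifted right by 32-cnt (subu $13,$0,$7 works because srl only uses the low
five bits of the shift count), and the bits shifted out of the top limb are
the return value.  Rough C equivalent (a sketch; 32-bit limbs, 0 < cnt < 32):

    #include <stdint.h>

    static uint32_t lshift_sketch (uint32_t *rp, const uint32_t *sp,
                                   long n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
      uint32_t high = sp[n - 1];
      uint32_t ret = high >> tnc;     /* function result: spilled bits */
      for (long i = n - 1; i > 0; i--)
        {
          uint32_t low = sp[i - 1];
          rp[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      rp[0] = high << cnt;            /* .Lend: least significant limb */
      return ret;
    }
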
diff --git a/rts/gmp/mpn/mips2/mul_1.s b/rts/gmp/mpn/mips2/mul_1.s
new file mode 100644
index 0000000000..ea8aa26809
--- /dev/null
+++ b/rts/gmp/mpn/mips2/mul_1.s
@@ -0,0 +1,85 @@
+ # MIPS2 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_mul_1
+ .ent __gmpn_mul_1
+__gmpn_mul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: mflo $10
+ mfhi $9
+ addiu $5,$5,4
+ addu $10,$10,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$10,$2 # carry from previous addition -> $2
+ sw $10,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: mflo $10
+ mfhi $9
+ addu $10,$10,$2
+ sltu $2,$10,$2
+ multu $8,$7
+ sw $10,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: mflo $10
+ mfhi $9
+ addu $10,$10,$2
+ sltu $2,$10,$2
+ sw $10,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_mul_1
diff --git a/rts/gmp/mpn/mips2/rshift.s b/rts/gmp/mpn/mips2/rshift.s
new file mode 100644
index 0000000000..37c8f39cb4
--- /dev/null
+++ b/rts/gmp/mpn/mips2/rshift.s
@@ -0,0 +1,92 @@
+ # MIPS2 __gmpn_rshift -- Shift a limb vector right by a given bit count.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_rshift
+ .ent __gmpn_rshift
+__gmpn_rshift:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5) # load first limb
+ subu $13,$0,$7
+ addiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ sll $2,$10,$13 # compute function result
+
+ subu $6,$6,$9
+
+.Loop0: lw $3,4($5)
+ addiu $4,$4,4
+ addiu $5,$5,4
+ addiu $9,$9,-1
+ srl $11,$10,$7
+ sll $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sw $8,-4($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: lw $3,4($5)
+ addiu $4,$4,16
+ addiu $6,$6,-4
+ srl $11,$10,$7
+ sll $12,$3,$13
+
+ lw $10,8($5)
+ srl $14,$3,$7
+ or $8,$11,$12
+ sw $8,-16($4)
+ sll $9,$10,$13
+
+ lw $3,12($5)
+ srl $11,$10,$7
+ or $8,$14,$9
+ sw $8,-12($4)
+ sll $12,$3,$13
+
+ lw $10,16($5)
+ srl $14,$3,$7
+ or $8,$11,$12
+ sw $8,-8($4)
+ sll $9,$10,$13
+
+ addiu $5,$5,16
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sw $8,-4($4)
+
+.Lend: srl $8,$10,$7
+ j $31
+ sw $8,0($4)
+ .end __gmpn_rshift
diff --git a/rts/gmp/mpn/mips2/sub_n.s b/rts/gmp/mpn/mips2/sub_n.s
new file mode 100644
index 0000000000..51d34f3ac3
--- /dev/null
+++ b/rts/gmp/mpn/mips2/sub_n.s
@@ -0,0 +1,120 @@
+ # MIPS2 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_sub_n
+ .ent __gmpn_sub_n
+__gmpn_sub_n:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5)
+ lw $11,0($6)
+
+ addiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ subu $7,$7,$9
+
+.Loop0: addiu $9,$9,-1
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ or $2,$2,$8
+
+ addiu $5,$5,4
+ addiu $6,$6,4
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ addiu $4,$4,4
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: addiu $7,$7,-4
+
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ or $2,$2,$8
+
+ lw $10,8($5)
+ addu $13,$13,$2
+ lw $11,8($6)
+ sltu $8,$13,$2
+ subu $13,$12,$13
+ sltu $2,$12,$13
+ sw $13,4($4)
+ or $2,$2,$8
+
+ lw $12,12($5)
+ addu $11,$11,$2
+ lw $13,12($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,8($4)
+ or $2,$2,$8
+
+ lw $10,16($5)
+ addu $13,$13,$2
+ lw $11,16($6)
+ sltu $8,$13,$2
+ subu $13,$12,$13
+ sltu $2,$12,$13
+ sw $13,12($4)
+ or $2,$2,$8
+
+ addiu $5,$5,16
+ addiu $6,$6,16
+
+ bne $7,$0,.Loop
+ addiu $4,$4,16
+
+.Lend: addu $11,$11,$2
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_sub_n
diff --git a/rts/gmp/mpn/mips2/submul_1.s b/rts/gmp/mpn/mips2/submul_1.s
new file mode 100644
index 0000000000..495dea3ba2
--- /dev/null
+++ b/rts/gmp/mpn/mips2/submul_1.s
@@ -0,0 +1,97 @@
+ # MIPS2 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_submul_1
+ .ent __gmpn_submul_1
+__gmpn_submul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addiu $5,$5,4
+ addu $3,$3,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ multu $8,$7
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_submul_1
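
submul_1 mirrors addmul_1 but detects the borrow with sgtu: the difference is
greater than the original operand exactly when the subtraction wrapped.  In
rough C (a sketch, illustrative names, 32-bit limbs):

    #include <stdint.h>

    /* One limb of submul_1: *rp -= s1 * limb, with borrow limb in/out. */
    static uint32_t submul_1_step (uint32_t *rp, uint32_t s1, uint32_t limb,
                                   uint32_t cy)
    {
      uint64_t p   = (uint64_t) s1 * limb;  /* multu: HI:LO product  */
      uint32_t lo  = (uint32_t) p + cy;     /* add old carry limb    */
      uint32_t c   = lo < cy;               /* sltu                  */
      uint32_t old = *rp;
      uint32_t r   = old - lo;              /* subu                  */
      c += r > old;                         /* sgtu: wrapped, borrow */
      *rp = r;
      return (uint32_t) (p >> 32) + c;      /* new carry limb        */
    }
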
diff --git a/rts/gmp/mpn/mips2/umul.s b/rts/gmp/mpn/mips2/umul.s
new file mode 100644
index 0000000000..40e847614c
--- /dev/null
+++ b/rts/gmp/mpn/mips2/umul.s
@@ -0,0 +1,30 @@
+ # Copyright (C) 1999 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+ .text
+ .align 2
+ .globl __umul_ppmm
+ .ent __umul_ppmm
+__umul_ppmm:
+ multu $5,$6
+ mflo $3
+ mfhi $2
+ sw $3,0($4)
+ j $31
+ .end __umul_ppmm
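
In C terms, __umul_ppmm stores the low product word through the pointer
passed in $4 and returns the high word in $2; a sketch (the prototype is
inferred from the register usage above):

    #include <stdint.h>

    uint32_t __umul_ppmm (uint32_t *lowptr, uint32_t a, uint32_t b)
    {
      uint64_t p = (uint64_t) a * b;   /* multu $5,$6        */
      *lowptr = (uint32_t) p;          /* mflo; sw $3,0($4)  */
      return (uint32_t) (p >> 32);     /* mfhi $2            */
    }
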
diff --git a/rts/gmp/mpn/mips3/README b/rts/gmp/mpn/mips3/README
new file mode 100644
index 0000000000..e94b2c7460
--- /dev/null
+++ b/rts/gmp/mpn/mips3/README
@@ -0,0 +1,23 @@
+This directory contains mpn functions optimized for MIPS3.  Examples of
+processors that implement MIPS3 are the R4000, R4400, R4600, R4700, and R8000.
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. On the R4000 and R4400, branches, both the plain and the "likely" ones,
+ take 3 cycles to execute. (The fastest possible loop will take 4 cycles,
+ because of the delay insn.)
+
+   On the R4600, branches take a single cycle.
+
+   On the R8000, branches often take no noticeable cycles, as they are
+   executed in a separate functional unit.
+
+2. The R4000 and R4400 have a load latency of 4 cycles.
+
+3. On the R4000 and R4400, multiplies take a data-dependent number of
+ cycles, contrary to the SGI documentation. There seem to be 3 or 4
+ possible latencies.
+
+STATUS
+
+Good...
diff --git a/rts/gmp/mpn/mips3/add_n.s b/rts/gmp/mpn/mips3/add_n.s
new file mode 100644
index 0000000000..adad0beaef
--- /dev/null
+++ b/rts/gmp/mpn/mips3/add_n.s
@@ -0,0 +1,120 @@
+ # MIPS3 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_add_n
+ .ent __gmpn_add_n
+__gmpn_add_n:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5)
+ ld $11,0($6)
+
+ daddiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ dsubu $7,$7,$9
+
+.Loop0: daddiu $9,$9,-1
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,8
+ daddiu $6,$6,8
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ daddiu $4,$4,8
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: daddiu $7,$7,-4
+
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ or $2,$2,$8
+
+ ld $10,16($5)
+ daddu $13,$13,$2
+ ld $11,16($6)
+ sltu $8,$13,$2
+ daddu $13,$12,$13
+ sltu $2,$13,$12
+ sd $13,8($4)
+ or $2,$2,$8
+
+ ld $12,24($5)
+ daddu $11,$11,$2
+ ld $13,24($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,16($4)
+ or $2,$2,$8
+
+ ld $10,32($5)
+ daddu $13,$13,$2
+ ld $11,32($6)
+ sltu $8,$13,$2
+ daddu $13,$12,$13
+ sltu $2,$13,$12
+ sd $13,24($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,32
+ daddiu $6,$6,32
+
+ bne $7,$0,.Loop
+ daddiu $4,$4,32
+
+.Lend: daddu $11,$11,$2
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_add_n
diff --git a/rts/gmp/mpn/mips3/addmul_1.s b/rts/gmp/mpn/mips3/addmul_1.s
new file mode 100644
index 0000000000..d390e2298e
--- /dev/null
+++ b/rts/gmp/mpn/mips3/addmul_1.s
@@ -0,0 +1,97 @@
+ # MIPS3 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_addmul_1
+ .ent __gmpn_addmul_1
+__gmpn_addmul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $3,$3,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dmultu $8,$7
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_addmul_1
diff --git a/rts/gmp/mpn/mips3/gmp-mparam.h b/rts/gmp/mpn/mips3/gmp-mparam.h
new file mode 100644
index 0000000000..656e90c7b0
--- /dev/null
+++ b/rts/gmp/mpn/mips3/gmp-mparam.h
@@ -0,0 +1,58 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the R10000 using the system cc. */
+/* Generated by tuneup.c, 2000-07-25. */
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 16
+#endif
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 32
+#endif
+
+/* Suppressed the TOOM3 values as they looked absolutely crazy
+ (698 and 21 respectively) */
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 58
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 54
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 82
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 159
+#endif
diff --git a/rts/gmp/mpn/mips3/lshift.s b/rts/gmp/mpn/mips3/lshift.s
new file mode 100644
index 0000000000..372606fddf
--- /dev/null
+++ b/rts/gmp/mpn/mips3/lshift.s
@@ -0,0 +1,95 @@
+ # MIPS3 __gmpn_lshift -- Shift a limb vector left by a given bit count.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_lshift
+ .ent __gmpn_lshift
+__gmpn_lshift:
+ .set noreorder
+ .set nomacro
+
+ dsll $2,$6,3
+ daddu $5,$5,$2 # make r5 point at end of src
+ ld $10,-8($5) # load first limb
+ dsubu $13,$0,$7
+ daddu $4,$4,$2 # make r4 point at end of res
+ daddiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ dsrl $2,$10,$13 # compute function result
+
+ dsubu $6,$6,$9
+
+.Loop0: ld $3,-16($5)
+ daddiu $4,$4,-8
+ daddiu $5,$5,-8
+ daddiu $9,$9,-1
+ dsll $11,$10,$7
+ dsrl $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sd $8,0($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: ld $3,-16($5)
+ daddiu $4,$4,-32
+ daddiu $6,$6,-4
+ dsll $11,$10,$7
+ dsrl $12,$3,$13
+
+ ld $10,-24($5)
+ dsll $14,$3,$7
+ or $8,$11,$12
+ sd $8,24($4)
+ dsrl $9,$10,$13
+
+ ld $3,-32($5)
+ dsll $11,$10,$7
+ or $8,$14,$9
+ sd $8,16($4)
+ dsrl $12,$3,$13
+
+ ld $10,-40($5)
+ dsll $14,$3,$7
+ or $8,$11,$12
+ sd $8,8($4)
+ dsrl $9,$10,$13
+
+ daddiu $5,$5,-32
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sd $8,0($4)
+
+.Lend: dsll $8,$10,$7
+ j $31
+ sd $8,-8($4)
+ .end __gmpn_lshift
diff --git a/rts/gmp/mpn/mips3/mul_1.s b/rts/gmp/mpn/mips3/mul_1.s
new file mode 100644
index 0000000000..6659e2b4eb
--- /dev/null
+++ b/rts/gmp/mpn/mips3/mul_1.s
@@ -0,0 +1,85 @@
+ # MIPS3 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_mul_1
+ .ent __gmpn_mul_1
+__gmpn_mul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: mflo $10
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $10,$10,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$10,$2 # carry from previous addition -> $2
+ sd $10,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: mflo $10
+ mfhi $9
+ daddu $10,$10,$2
+ sltu $2,$10,$2
+ dmultu $8,$7
+ sd $10,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: mflo $10
+ mfhi $9
+ daddu $10,$10,$2
+ sltu $2,$10,$2
+ sd $10,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_mul_1
diff --git a/rts/gmp/mpn/mips3/rshift.s b/rts/gmp/mpn/mips3/rshift.s
new file mode 100644
index 0000000000..59c7fd3492
--- /dev/null
+++ b/rts/gmp/mpn/mips3/rshift.s
@@ -0,0 +1,92 @@
+ # MIPS3 __gmpn_rshift -- Shift a limb vector right by a given bit count.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_rshift
+ .ent __gmpn_rshift
+__gmpn_rshift:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5) # load first limb
+ dsubu $13,$0,$7
+ daddiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ dsll $2,$10,$13 # compute function result
+
+ dsubu $6,$6,$9
+
+.Loop0: ld $3,8($5)
+ daddiu $4,$4,8
+ daddiu $5,$5,8
+ daddiu $9,$9,-1
+ dsrl $11,$10,$7
+ dsll $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sd $8,-8($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: ld $3,8($5)
+ daddiu $4,$4,32
+ daddiu $6,$6,-4
+ dsrl $11,$10,$7
+ dsll $12,$3,$13
+
+ ld $10,16($5)
+ dsrl $14,$3,$7
+ or $8,$11,$12
+ sd $8,-32($4)
+ dsll $9,$10,$13
+
+ ld $3,24($5)
+ dsrl $11,$10,$7
+ or $8,$14,$9
+ sd $8,-24($4)
+ dsll $12,$3,$13
+
+ ld $10,32($5)
+ dsrl $14,$3,$7
+ or $8,$11,$12
+ sd $8,-16($4)
+ dsll $9,$10,$13
+
+ daddiu $5,$5,32
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sd $8,-8($4)
+
+.Lend: dsrl $8,$10,$7
+ j $31
+ sd $8,0($4)
+ .end __gmpn_rshift
diff --git a/rts/gmp/mpn/mips3/sub_n.s b/rts/gmp/mpn/mips3/sub_n.s
new file mode 100644
index 0000000000..c57c824b04
--- /dev/null
+++ b/rts/gmp/mpn/mips3/sub_n.s
@@ -0,0 +1,120 @@
+ # MIPS3 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_sub_n
+ .ent __gmpn_sub_n
+__gmpn_sub_n:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5)
+ ld $11,0($6)
+
+ daddiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ dsubu $7,$7,$9
+
+.Loop0: daddiu $9,$9,-1
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,8
+ daddiu $6,$6,8
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ daddiu $4,$4,8
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: daddiu $7,$7,-4
+
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ or $2,$2,$8
+
+ ld $10,16($5)
+ daddu $13,$13,$2
+ ld $11,16($6)
+ sltu $8,$13,$2
+ dsubu $13,$12,$13
+ sltu $2,$12,$13
+ sd $13,8($4)
+ or $2,$2,$8
+
+ ld $12,24($5)
+ daddu $11,$11,$2
+ ld $13,24($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,16($4)
+ or $2,$2,$8
+
+ ld $10,32($5)
+ daddu $13,$13,$2
+ ld $11,32($6)
+ sltu $8,$13,$2
+ dsubu $13,$12,$13
+ sltu $2,$12,$13
+ sd $13,24($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,32
+ daddiu $6,$6,32
+
+ bne $7,$0,.Loop
+ daddiu $4,$4,32
+
+.Lend: daddu $11,$11,$2
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_sub_n
diff --git a/rts/gmp/mpn/mips3/submul_1.s b/rts/gmp/mpn/mips3/submul_1.s
new file mode 100644
index 0000000000..531f9705a6
--- /dev/null
+++ b/rts/gmp/mpn/mips3/submul_1.s
@@ -0,0 +1,97 @@
+ # MIPS3 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_submul_1
+ .ent __gmpn_submul_1
+__gmpn_submul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $3,$3,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dmultu $8,$7
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_submul_1
diff --git a/rts/gmp/mpn/mp_bases.c b/rts/gmp/mpn/mp_bases.c
new file mode 100644
index 0000000000..011c328c80
--- /dev/null
+++ b/rts/gmp/mpn/mp_bases.c
@@ -0,0 +1,550 @@
+/* __mp_bases -- Structure for conversion between internal binary
+ format and strings in base 2..255. The fields are explained in
+ gmp-impl.h.
+
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
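+/* Each entry is indexed by the base.  Briefly (gmp-impl.h has the
+   authoritative field descriptions): chars_per_limb is the number of
+   digits in the base that always fit in one limb; chars_per_bit_exactly
+   is log(2)/log(base); big_base is base**chars_per_limb, the largest
+   power of the base that fits in a limb (except for power-of-2 bases,
+   where it is log2(base)); and big_base_inverted is a fixed-point
+   approximation of 1/big_base, so conversion can multiply instead of
+   divide.  */
+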
+#if BITS_PER_MP_LIMB == 32
+const struct bases __mp_bases[256] =
+{
+ /* 0 */ {0, 0.0, 0, 0},
+ /* 1 */ {0, 1e38, 0, 0},
+ /* 2 */ {32, 1.0000000000000000, 0x1, 0x0},
+ /* 3 */ {20, 0.6309297535714575, 0xcfd41b91, 0x3b563c24},
+ /* 4 */ {16, 0.5000000000000000, 0x2, 0x0},
+ /* 5 */ {13, 0.4306765580733931, 0x48c27395, 0xc25c2684},
+ /* 6 */ {12, 0.3868528072345416, 0x81bf1000, 0xf91bd1b6},
+ /* 7 */ {11, 0.3562071871080222, 0x75db9c97, 0x1607a2cb},
+ /* 8 */ {10, 0.3333333333333334, 0x3, 0x0},
+ /* 9 */ {10, 0.3154648767857287, 0xcfd41b91, 0x3b563c24},
+ /* 10 */ {9, 0.3010299956639811, 0x3b9aca00, 0x12e0be82},
+ /* 11 */ {9, 0.2890648263178878, 0x8c8b6d2b, 0xd24cde04},
+ /* 12 */ {8, 0.2789429456511298, 0x19a10000, 0x3fa39ab5},
+ /* 13 */ {8, 0.2702381544273197, 0x309f1021, 0x50f8ac5f},
+ /* 14 */ {8, 0.2626495350371936, 0x57f6c100, 0x74843b1e},
+ /* 15 */ {8, 0.2559580248098155, 0x98c29b81, 0xad0326c2},
+ /* 16 */ {8, 0.2500000000000000, 0x4, 0x0},
+ /* 17 */ {7, 0.2446505421182260, 0x18754571, 0x4ef0b6bd},
+ /* 18 */ {7, 0.2398124665681315, 0x247dbc80, 0xc0fc48a1},
+ /* 19 */ {7, 0.2354089133666382, 0x3547667b, 0x33838942},
+ /* 20 */ {7, 0.2313782131597592, 0x4c4b4000, 0xad7f29ab},
+ /* 21 */ {7, 0.2276702486969530, 0x6b5a6e1d, 0x313c3d15},
+ /* 22 */ {7, 0.2242438242175754, 0x94ace180, 0xb8cca9e0},
+ /* 23 */ {7, 0.2210647294575037, 0xcaf18367, 0x42ed6de9},
+ /* 24 */ {6, 0.2181042919855316, 0xb640000, 0x67980e0b},
+ /* 25 */ {6, 0.2153382790366965, 0xe8d4a51, 0x19799812},
+ /* 26 */ {6, 0.2127460535533632, 0x1269ae40, 0xbce85396},
+ /* 27 */ {6, 0.2103099178571525, 0x17179149, 0x62c103a9},
+ /* 28 */ {6, 0.2080145976765095, 0x1cb91000, 0x1d353d43},
+ /* 29 */ {6, 0.2058468324604344, 0x23744899, 0xce1decea},
+ /* 30 */ {6, 0.2037950470905062, 0x2b73a840, 0x790fc511},
+ /* 31 */ {6, 0.2018490865820999, 0x34e63b41, 0x35b865a0},
+ /* 32 */ {6, 0.2000000000000000, 0x5, 0x0},
+ /* 33 */ {6, 0.1982398631705605, 0x4cfa3cc1, 0xa9aed1b3},
+ /* 34 */ {6, 0.1965616322328226, 0x5c13d840, 0x63dfc229},
+ /* 35 */ {6, 0.1949590218937863, 0x6d91b519, 0x2b0fee30},
+ /* 36 */ {6, 0.1934264036172708, 0x81bf1000, 0xf91bd1b6},
+ /* 37 */ {6, 0.1919587200065601, 0x98ede0c9, 0xac89c3a9},
+ /* 38 */ {6, 0.1905514124267734, 0xb3773e40, 0x6d2c32fe},
+ /* 39 */ {6, 0.1892003595168700, 0xd1bbc4d1, 0x387907c9},
+ /* 40 */ {6, 0.1879018247091076, 0xf4240000, 0xc6f7a0b},
+ /* 41 */ {5, 0.1866524112389434, 0x6e7d349, 0x28928154},
+ /* 42 */ {5, 0.1854490234153689, 0x7ca30a0, 0x6e8629d},
+ /* 43 */ {5, 0.1842888331487062, 0x8c32bbb, 0xd373dca0},
+ /* 44 */ {5, 0.1831692509136336, 0x9d46c00, 0xa0b17895},
+ /* 45 */ {5, 0.1820879004699383, 0xaffacfd, 0x746811a5},
+ /* 46 */ {5, 0.1810425967800402, 0xc46bee0, 0x4da6500f},
+ /* 47 */ {5, 0.1800313266566926, 0xdab86ef, 0x2ba23582},
+ /* 48 */ {5, 0.1790522317510414, 0xf300000, 0xdb20a88},
+ /* 49 */ {5, 0.1781035935540111, 0x10d63af1, 0xe68d5ce4},
+ /* 50 */ {5, 0.1771838201355579, 0x12a05f20, 0xb7cdfd9d},
+ /* 51 */ {5, 0.1762914343888821, 0x1490aae3, 0x8e583933},
+ /* 52 */ {5, 0.1754250635819545, 0x16a97400, 0x697cc3ea},
+ /* 53 */ {5, 0.1745834300480449, 0x18ed2825, 0x48a5ca6c},
+ /* 54 */ {5, 0.1737653428714400, 0x1b5e4d60, 0x2b52db16},
+ /* 55 */ {5, 0.1729696904450771, 0x1dff8297, 0x111586a6},
+ /* 56 */ {5, 0.1721954337940981, 0x20d38000, 0xf31d2b36},
+ /* 57 */ {5, 0.1714416005739134, 0x23dd1799, 0xc8d76d19},
+ /* 58 */ {5, 0.1707072796637201, 0x271f35a0, 0xa2cb1eb4},
+ /* 59 */ {5, 0.1699916162869140, 0x2a9ce10b, 0x807c3ec3},
+ /* 60 */ {5, 0.1692938075987814, 0x2e593c00, 0x617ec8bf},
+ /* 61 */ {5, 0.1686130986895011, 0x3257844d, 0x45746cbe},
+ /* 62 */ {5, 0.1679487789570419, 0x369b13e0, 0x2c0aa273},
+ /* 63 */ {5, 0.1673001788101741, 0x3b27613f, 0x14f90805},
+ /* 64 */ {5, 0.1666666666666667, 0x6, 0x0},
+ /* 65 */ {5, 0.1660476462159378, 0x4528a141, 0xd9cf0829},
+ /* 66 */ {5, 0.1654425539190583, 0x4aa51420, 0xb6fc4841},
+ /* 67 */ {5, 0.1648508567221604, 0x50794633, 0x973054cb},
+ /* 68 */ {5, 0.1642720499620502, 0x56a94400, 0x7a1dbe4b},
+ /* 69 */ {5, 0.1637056554452156, 0x5d393975, 0x5f7fcd7f},
+ /* 70 */ {5, 0.1631512196835108, 0x642d7260, 0x47196c84},
+ /* 71 */ {5, 0.1626083122716341, 0x6b8a5ae7, 0x30b43635},
+ /* 72 */ {5, 0.1620765243931223, 0x73548000, 0x1c1fa5f6},
+ /* 73 */ {5, 0.1615554674429964, 0x7b908fe9, 0x930634a},
+ /* 74 */ {5, 0.1610447717564445, 0x84435aa0, 0xef7f4a3c},
+ /* 75 */ {5, 0.1605440854340214, 0x8d71d25b, 0xcf5552d2},
+ /* 76 */ {5, 0.1600530732548213, 0x97210c00, 0xb1a47c8e},
+ /* 77 */ {5, 0.1595714156699382, 0xa1563f9d, 0x9634b43e},
+ /* 78 */ {5, 0.1590988078692941, 0xac16c8e0, 0x7cd3817d},
+ /* 79 */ {5, 0.1586349589155960, 0xb768278f, 0x65536761},
+ /* 80 */ {5, 0.1581795909397823, 0xc3500000, 0x4f8b588e},
+ /* 81 */ {5, 0.1577324383928644, 0xcfd41b91, 0x3b563c24},
+ /* 82 */ {5, 0.1572932473495469, 0xdcfa6920, 0x28928154},
+ /* 83 */ {5, 0.1568617748594410, 0xeac8fd83, 0x1721bfb0},
+ /* 84 */ {5, 0.1564377883420716, 0xf9461400, 0x6e8629d},
+ /* 85 */ {4, 0.1560210650222250, 0x31c84b1, 0x491cc17c},
+ /* 86 */ {4, 0.1556113914024940, 0x342ab10, 0x3a11d83b},
+ /* 87 */ {4, 0.1552085627701551, 0x36a2c21, 0x2be074cd},
+ /* 88 */ {4, 0.1548123827357682, 0x3931000, 0x1e7a02e7},
+ /* 89 */ {4, 0.1544226628011101, 0x3bd5ee1, 0x11d10edd},
+ /* 90 */ {4, 0.1540392219542636, 0x3e92110, 0x5d92c68},
+ /* 91 */ {4, 0.1536618862898642, 0x4165ef1, 0xf50dbfb2},
+ /* 92 */ {4, 0.1532904886526781, 0x4452100, 0xdf9f1316},
+ /* 93 */ {4, 0.1529248683028321, 0x4756fd1, 0xcb52a684},
+ /* 94 */ {4, 0.1525648706011593, 0x4a75410, 0xb8163e97},
+ /* 95 */ {4, 0.1522103467132434, 0x4dad681, 0xa5d8f269},
+ /* 96 */ {4, 0.1518611533308632, 0x5100000, 0x948b0fcd},
+ /* 97 */ {4, 0.1515171524096389, 0x546d981, 0x841e0215},
+ /* 98 */ {4, 0.1511782109217764, 0x57f6c10, 0x74843b1e},
+ /* 99 */ {4, 0.1508442006228941, 0x5b9c0d1, 0x65b11e6e},
+ /* 100 */ {4, 0.1505149978319906, 0x5f5e100, 0x5798ee23},
+ /* 101 */ {4, 0.1501904832236879, 0x633d5f1, 0x4a30b99b},
+ /* 102 */ {4, 0.1498705416319474, 0x673a910, 0x3d6e4d94},
+ /* 103 */ {4, 0.1495550618645152, 0x6b563e1, 0x314825b0},
+ /* 104 */ {4, 0.1492439365274121, 0x6f91000, 0x25b55f2e},
+ /* 105 */ {4, 0.1489370618588283, 0x73eb721, 0x1aadaccb},
+ /* 106 */ {4, 0.1486343375718350, 0x7866310, 0x10294ba2},
+ /* 107 */ {4, 0.1483356667053617, 0x7d01db1, 0x620f8f6},
+ /* 108 */ {4, 0.1480409554829326, 0x81bf100, 0xf91bd1b6},
+ /* 109 */ {4, 0.1477501131786861, 0x869e711, 0xe6d37b2a},
+ /* 110 */ {4, 0.1474630519902391, 0x8ba0a10, 0xd55cff6e},
+ /* 111 */ {4, 0.1471796869179852, 0x90c6441, 0xc4ad2db2},
+ /* 112 */ {4, 0.1468999356504447, 0x9610000, 0xb4b985cf},
+ /* 113 */ {4, 0.1466237184553111, 0x9b7e7c1, 0xa5782bef},
+ /* 114 */ {4, 0.1463509580758620, 0xa112610, 0x96dfdd2a},
+ /* 115 */ {4, 0.1460815796324244, 0xa6cc591, 0x88e7e509},
+ /* 116 */ {4, 0.1458155105286054, 0xacad100, 0x7b8813d3},
+ /* 117 */ {4, 0.1455526803620167, 0xb2b5331, 0x6eb8b595},
+ /* 118 */ {4, 0.1452930208392428, 0xb8e5710, 0x627289db},
+ /* 119 */ {4, 0.1450364656948130, 0xbf3e7a1, 0x56aebc07},
+ /* 120 */ {4, 0.1447829506139581, 0xc5c1000, 0x4b66dc33},
+ /* 121 */ {4, 0.1445324131589439, 0xcc6db61, 0x4094d8a3},
+ /* 122 */ {4, 0.1442847926987864, 0xd345510, 0x3632f7a5},
+ /* 123 */ {4, 0.1440400303421672, 0xda48871, 0x2c3bd1f0},
+ /* 124 */ {4, 0.1437980688733775, 0xe178100, 0x22aa4d5f},
+ /* 125 */ {4, 0.1435588526911310, 0xe8d4a51, 0x19799812},
+ /* 126 */ {4, 0.1433223277500932, 0xf05f010, 0x10a523e5},
+ /* 127 */ {4, 0.1430884415049874, 0xf817e01, 0x828a237},
+ /* 128 */ {4, 0.1428571428571428, 0x7, 0x0},
+ /* 129 */ {4, 0.1426283821033600, 0x10818201, 0xf04ec452},
+ /* 130 */ {4, 0.1424021108869747, 0x11061010, 0xe136444a},
+ /* 131 */ {4, 0.1421782821510107, 0x118db651, 0xd2af9589},
+ /* 132 */ {4, 0.1419568500933153, 0x12188100, 0xc4b42a83},
+ /* 133 */ {4, 0.1417377701235801, 0x12a67c71, 0xb73dccf5},
+ /* 134 */ {4, 0.1415209988221527, 0x1337b510, 0xaa4698c5},
+ /* 135 */ {4, 0.1413064939005528, 0x13cc3761, 0x9dc8f729},
+ /* 136 */ {4, 0.1410942141636095, 0x14641000, 0x91bf9a30},
+ /* 137 */ {4, 0.1408841194731412, 0x14ff4ba1, 0x86257887},
+ /* 138 */ {4, 0.1406761707131039, 0x159df710, 0x7af5c98c},
+ /* 139 */ {4, 0.1404703297561400, 0x16401f31, 0x702c01a0},
+ /* 140 */ {4, 0.1402665594314587, 0x16e5d100, 0x65c3ceb1},
+ /* 141 */ {4, 0.1400648234939879, 0x178f1991, 0x5bb91502},
+ /* 142 */ {4, 0.1398650865947379, 0x183c0610, 0x5207ec23},
+ /* 143 */ {4, 0.1396673142523192, 0x18eca3c1, 0x48ac9c19},
+ /* 144 */ {4, 0.1394714728255649, 0x19a10000, 0x3fa39ab5},
+ /* 145 */ {4, 0.1392775294872041, 0x1a592841, 0x36e98912},
+ /* 146 */ {4, 0.1390854521985406, 0x1b152a10, 0x2e7b3140},
+ /* 147 */ {4, 0.1388952096850913, 0x1bd51311, 0x2655840b},
+ /* 148 */ {4, 0.1387067714131417, 0x1c98f100, 0x1e7596ea},
+ /* 149 */ {4, 0.1385201075671774, 0x1d60d1b1, 0x16d8a20d},
+ /* 150 */ {4, 0.1383351890281539, 0x1e2cc310, 0xf7bfe87},
+ /* 151 */ {4, 0.1381519873525671, 0x1efcd321, 0x85d2492},
+ /* 152 */ {4, 0.1379704747522905, 0x1fd11000, 0x179a9f4},
+ /* 153 */ {4, 0.1377906240751463, 0x20a987e1, 0xf59e80eb},
+ /* 154 */ {4, 0.1376124087861776, 0x21864910, 0xe8b768db},
+ /* 155 */ {4, 0.1374358029495937, 0x226761f1, 0xdc39d6d5},
+ /* 156 */ {4, 0.1372607812113589, 0x234ce100, 0xd021c5d1},
+ /* 157 */ {4, 0.1370873187823978, 0x2436d4d1, 0xc46b5e37},
+ /* 158 */ {4, 0.1369153914223921, 0x25254c10, 0xb912f39c},
+ /* 159 */ {4, 0.1367449754241439, 0x26185581, 0xae150294},
+ /* 160 */ {4, 0.1365760475984821, 0x27100000, 0xa36e2eb1},
+ /* 161 */ {4, 0.1364085852596902, 0x280c5a81, 0x991b4094},
+ /* 162 */ {4, 0.1362425662114337, 0x290d7410, 0x8f19241e},
+ /* 163 */ {4, 0.1360779687331669, 0x2a135bd1, 0x8564e6b7},
+ /* 164 */ {4, 0.1359147715670014, 0x2b1e2100, 0x7bfbb5b4},
+ /* 165 */ {4, 0.1357529539050150, 0x2c2dd2f1, 0x72dadcc8},
+ /* 166 */ {4, 0.1355924953769863, 0x2d428110, 0x69ffc498},
+ /* 167 */ {4, 0.1354333760385373, 0x2e5c3ae1, 0x6167f154},
+ /* 168 */ {4, 0.1352755763596663, 0x2f7b1000, 0x5911016e},
+ /* 169 */ {4, 0.1351190772136599, 0x309f1021, 0x50f8ac5f},
+ /* 170 */ {4, 0.1349638598663645, 0x31c84b10, 0x491cc17c},
+ /* 171 */ {4, 0.1348099059658079, 0x32f6d0b1, 0x417b26d8},
+ /* 172 */ {4, 0.1346571975321549, 0x342ab100, 0x3a11d83b},
+ /* 173 */ {4, 0.1345057169479844, 0x3563fc11, 0x32dee622},
+ /* 174 */ {4, 0.1343554469488779, 0x36a2c210, 0x2be074cd},
+ /* 175 */ {4, 0.1342063706143054, 0x37e71341, 0x2514bb58},
+ /* 176 */ {4, 0.1340584713587980, 0x39310000, 0x1e7a02e7},
+ /* 177 */ {4, 0.1339117329233981, 0x3a8098c1, 0x180ea5d0},
+ /* 178 */ {4, 0.1337661393673756, 0x3bd5ee10, 0x11d10edd},
+ /* 179 */ {4, 0.1336216750601996, 0x3d311091, 0xbbfb88e},
+ /* 180 */ {4, 0.1334783246737591, 0x3e921100, 0x5d92c68},
+ /* 181 */ {4, 0.1333360731748201, 0x3ff90031, 0x1c024c},
+ /* 182 */ {4, 0.1331949058177136, 0x4165ef10, 0xf50dbfb2},
+ /* 183 */ {4, 0.1330548081372441, 0x42d8eea1, 0xea30efa3},
+ /* 184 */ {4, 0.1329157659418126, 0x44521000, 0xdf9f1316},
+ /* 185 */ {4, 0.1327777653067443, 0x45d16461, 0xd555c0c9},
+ /* 186 */ {4, 0.1326407925678156, 0x4756fd10, 0xcb52a684},
+ /* 187 */ {4, 0.1325048343149731, 0x48e2eb71, 0xc193881f},
+ /* 188 */ {4, 0.1323698773862368, 0x4a754100, 0xb8163e97},
+ /* 189 */ {4, 0.1322359088617821, 0x4c0e0f51, 0xaed8b724},
+ /* 190 */ {4, 0.1321029160581950, 0x4dad6810, 0xa5d8f269},
+ /* 191 */ {4, 0.1319708865228925, 0x4f535d01, 0x9d15039d},
+ /* 192 */ {4, 0.1318398080287045, 0x51000000, 0x948b0fcd},
+ /* 193 */ {4, 0.1317096685686114, 0x52b36301, 0x8c394d1d},
+ /* 194 */ {4, 0.1315804563506306, 0x546d9810, 0x841e0215},
+ /* 195 */ {4, 0.1314521597928493, 0x562eb151, 0x7c3784f8},
+ /* 196 */ {4, 0.1313247675185968, 0x57f6c100, 0x74843b1e},
+ /* 197 */ {4, 0.1311982683517524, 0x59c5d971, 0x6d02985d},
+ /* 198 */ {4, 0.1310726513121843, 0x5b9c0d10, 0x65b11e6e},
+ /* 199 */ {4, 0.1309479056113158, 0x5d796e61, 0x5e8e5c64},
+ /* 200 */ {4, 0.1308240206478128, 0x5f5e1000, 0x5798ee23},
+ /* 201 */ {4, 0.1307009860033912, 0x614a04a1, 0x50cf7bde},
+ /* 202 */ {4, 0.1305787914387386, 0x633d5f10, 0x4a30b99b},
+ /* 203 */ {4, 0.1304574268895465, 0x65383231, 0x43bb66bd},
+ /* 204 */ {4, 0.1303368824626505, 0x673a9100, 0x3d6e4d94},
+ /* 205 */ {4, 0.1302171484322746, 0x69448e91, 0x374842ee},
+ /* 206 */ {4, 0.1300982152363760, 0x6b563e10, 0x314825b0},
+ /* 207 */ {4, 0.1299800734730872, 0x6d6fb2c1, 0x2b6cde75},
+ /* 208 */ {4, 0.1298627138972530, 0x6f910000, 0x25b55f2e},
+ /* 209 */ {4, 0.1297461274170591, 0x71ba3941, 0x2020a2c5},
+ /* 210 */ {4, 0.1296303050907487, 0x73eb7210, 0x1aadaccb},
+ /* 211 */ {4, 0.1295152381234257, 0x7624be11, 0x155b891f},
+ /* 212 */ {4, 0.1294009178639407, 0x78663100, 0x10294ba2},
+ /* 213 */ {4, 0.1292873358018581, 0x7aafdeb1, 0xb160fe9},
+ /* 214 */ {4, 0.1291744835645007, 0x7d01db10, 0x620f8f6},
+ /* 215 */ {4, 0.1290623529140715, 0x7f5c3a21, 0x14930ef},
+ /* 216 */ {4, 0.1289509357448472, 0x81bf1000, 0xf91bd1b6},
+ /* 217 */ {4, 0.1288402240804449, 0x842a70e1, 0xefdcb0c7},
+ /* 218 */ {4, 0.1287302100711567, 0x869e7110, 0xe6d37b2a},
+ /* 219 */ {4, 0.1286208859913518, 0x891b24f1, 0xddfeb94a},
+ /* 220 */ {4, 0.1285122442369443, 0x8ba0a100, 0xd55cff6e},
+ /* 221 */ {4, 0.1284042773229231, 0x8e2ef9d1, 0xcceced50},
+ /* 222 */ {4, 0.1282969778809442, 0x90c64410, 0xc4ad2db2},
+ /* 223 */ {4, 0.1281903386569819, 0x93669481, 0xbc9c75f9},
+ /* 224 */ {4, 0.1280843525090381, 0x96100000, 0xb4b985cf},
+ /* 225 */ {4, 0.1279790124049077, 0x98c29b81, 0xad0326c2},
+ /* 226 */ {4, 0.1278743114199984, 0x9b7e7c10, 0xa5782bef},
+ /* 227 */ {4, 0.1277702427352035, 0x9e43b6d1, 0x9e1771a9},
+ /* 228 */ {4, 0.1276667996348261, 0xa1126100, 0x96dfdd2a},
+ /* 229 */ {4, 0.1275639755045533, 0xa3ea8ff1, 0x8fd05c41},
+ /* 230 */ {4, 0.1274617638294791, 0xa6cc5910, 0x88e7e509},
+ /* 231 */ {4, 0.1273601581921741, 0xa9b7d1e1, 0x8225759d},
+ /* 232 */ {4, 0.1272591522708010, 0xacad1000, 0x7b8813d3},
+ /* 233 */ {4, 0.1271587398372755, 0xafac2921, 0x750eccf9},
+ /* 234 */ {4, 0.1270589147554692, 0xb2b53310, 0x6eb8b595},
+ /* 235 */ {4, 0.1269596709794558, 0xb5c843b1, 0x6884e923},
+ /* 236 */ {4, 0.1268610025517973, 0xb8e57100, 0x627289db},
+ /* 237 */ {4, 0.1267629036018709, 0xbc0cd111, 0x5c80c07b},
+ /* 238 */ {4, 0.1266653683442337, 0xbf3e7a10, 0x56aebc07},
+ /* 239 */ {4, 0.1265683910770258, 0xc27a8241, 0x50fbb19b},
+ /* 240 */ {4, 0.1264719661804097, 0xc5c10000, 0x4b66dc33},
+ /* 241 */ {4, 0.1263760881150453, 0xc91209c1, 0x45ef7c7c},
+ /* 242 */ {4, 0.1262807514205999, 0xcc6db610, 0x4094d8a3},
+ /* 243 */ {4, 0.1261859507142915, 0xcfd41b91, 0x3b563c24},
+ /* 244 */ {4, 0.1260916806894653, 0xd3455100, 0x3632f7a5},
+ /* 245 */ {4, 0.1259979361142023, 0xd6c16d31, 0x312a60c3},
+ /* 246 */ {4, 0.1259047118299582, 0xda488710, 0x2c3bd1f0},
+ /* 247 */ {4, 0.1258120027502338, 0xdddab5a1, 0x2766aa45},
+ /* 248 */ {4, 0.1257198038592741, 0xe1781000, 0x22aa4d5f},
+ /* 249 */ {4, 0.1256281102107963, 0xe520ad61, 0x1e06233c},
+ /* 250 */ {4, 0.1255369169267456, 0xe8d4a510, 0x19799812},
+ /* 251 */ {4, 0.1254462191960791, 0xec940e71, 0x15041c33},
+ /* 252 */ {4, 0.1253560122735751, 0xf05f0100, 0x10a523e5},
+ /* 253 */ {4, 0.1252662914786691, 0xf4359451, 0xc5c2749},
+ /* 254 */ {4, 0.1251770521943144, 0xf817e010, 0x828a237},
+ /* 255 */ {4, 0.1250882898658681, 0xfc05fc01, 0x40a1423},
+};
+#endif
+#if BITS_PER_MP_LIMB == 64
+const struct bases __mp_bases[256] =
+{
+ /* 0 */ {0, 0.0, 0, 0},
+ /* 1 */ {0, 1e38, 0, 0},
+ /* 2 */ {64, 1.0000000000000000, CNST_LIMB(0x1), CNST_LIMB(0x0)},
+ /* 3 */ {40, 0.6309297535714574, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 4 */ {32, 0.5000000000000000, CNST_LIMB(0x2), CNST_LIMB(0x0)},
+ /* 5 */ {27, 0.4306765580733931, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
+ /* 6 */ {24, 0.3868528072345416, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 7 */ {22, 0.3562071871080222, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
+ /* 8 */ {21, 0.3333333333333334, CNST_LIMB(0x3), CNST_LIMB(0x0)},
+ /* 9 */ {20, 0.3154648767857287, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 10 */ {19, 0.3010299956639811, CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a)},
+ /* 11 */ {18, 0.2890648263178878, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
+ /* 12 */ {17, 0.2789429456511298, CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f)},
+ /* 13 */ {17, 0.2702381544273197, CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3)},
+ /* 14 */ {16, 0.2626495350371936, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
+ /* 15 */ {16, 0.2559580248098155, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
+ /* 16 */ {16, 0.2500000000000000, CNST_LIMB(0x4), CNST_LIMB(0x0)},
+ /* 17 */ {15, 0.2446505421182260, CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323)},
+ /* 18 */ {15, 0.2398124665681315, CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c)},
+ /* 19 */ {15, 0.2354089133666382, CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec)},
+ /* 20 */ {14, 0.2313782131597592, CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e)},
+ /* 21 */ {14, 0.2276702486969530, CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239)},
+ /* 22 */ {14, 0.2242438242175754, CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08)},
+ /* 23 */ {14, 0.2210647294575037, CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38)},
+ /* 24 */ {13, 0.2181042919855316, CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1)},
+ /* 25 */ {13, 0.2153382790366965, CNST_LIMB(0x14adf4b7320334b9), CNST_LIMB(0x8c240c4aecb13bb5)},
+ /* 26 */ {13, 0.2127460535533632, CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9)},
+ /* 27 */ {13, 0.2103099178571525, CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae)},
+ /* 28 */ {13, 0.2080145976765095, CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636)},
+ /* 29 */ {13, 0.2058468324604344, CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d)},
+ /* 30 */ {13, 0.2037950470905062, CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6)},
+ /* 31 */ {12, 0.2018490865820999, CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5)},
+ /* 32 */ {12, 0.2000000000000000, CNST_LIMB(0x5), CNST_LIMB(0x0)},
+ /* 33 */ {12, 0.1982398631705605, CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f)},
+ /* 34 */ {12, 0.1965616322328226, CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec)},
+ /* 35 */ {12, 0.1949590218937863, CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509)},
+ /* 36 */ {12, 0.1934264036172708, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 37 */ {12, 0.1919587200065601, CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f)},
+ /* 38 */ {12, 0.1905514124267734, CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544)},
+ /* 39 */ {12, 0.1892003595168700, CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c)},
+ /* 40 */ {12, 0.1879018247091076, CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197)},
+ /* 41 */ {11, 0.1866524112389434, CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae)},
+ /* 42 */ {11, 0.1854490234153689, CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7)},
+ /* 43 */ {11, 0.1842888331487062, CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94)},
+ /* 44 */ {11, 0.1831692509136336, CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a)},
+ /* 45 */ {11, 0.1820879004699383, CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46)},
+ /* 46 */ {11, 0.1810425967800402, CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083)},
+ /* 47 */ {11, 0.1800313266566926, CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988)},
+ /* 48 */ {11, 0.1790522317510413, CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167)},
+ /* 49 */ {11, 0.1781035935540111, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
+ /* 50 */ {11, 0.1771838201355579, CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962)},
+ /* 51 */ {11, 0.1762914343888821, CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e)},
+ /* 52 */ {11, 0.1754250635819545, CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d)},
+ /* 53 */ {11, 0.1745834300480449, CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25)},
+ /* 54 */ {11, 0.1737653428714400, CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a)},
+ /* 55 */ {11, 0.1729696904450771, CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d)},
+ /* 56 */ {11, 0.1721954337940981, CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1)},
+ /* 57 */ {10, 0.1714416005739134, CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1)},
+ /* 58 */ {10, 0.1707072796637201, CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c)},
+ /* 59 */ {10, 0.1699916162869140, CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492)},
+ /* 60 */ {10, 0.1692938075987814, CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5)},
+ /* 61 */ {10, 0.1686130986895011, CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416)},
+ /* 62 */ {10, 0.1679487789570419, CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5)},
+ /* 63 */ {10, 0.1673001788101741, CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641)},
+ /* 64 */ {10, 0.1666666666666667, CNST_LIMB(0x6), CNST_LIMB(0x0)},
+ /* 65 */ {10, 0.1660476462159378, CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1)},
+ /* 66 */ {10, 0.1654425539190583, CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33)},
+ /* 67 */ {10, 0.1648508567221604, CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0)},
+ /* 68 */ {10, 0.1642720499620502, CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409)},
+ /* 69 */ {10, 0.1637056554452156, CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f)},
+ /* 70 */ {10, 0.1631512196835108, CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62)},
+ /* 71 */ {10, 0.1626083122716341, CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2)},
+ /* 72 */ {10, 0.1620765243931223, CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2)},
+ /* 73 */ {10, 0.1615554674429964, CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b)},
+ /* 74 */ {10, 0.1610447717564444, CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5)},
+ /* 75 */ {10, 0.1605440854340214, CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e)},
+ /* 76 */ {10, 0.1600530732548213, CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd)},
+ /* 77 */ {10, 0.1595714156699382, CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec)},
+ /* 78 */ {10, 0.1590988078692941, CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505)},
+ /* 79 */ {10, 0.1586349589155960, CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac)},
+ /* 80 */ {10, 0.1581795909397823, CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d)},
+ /* 81 */ {10, 0.1577324383928644, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 82 */ {10, 0.1572932473495469, CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f)},
+ /* 83 */ {10, 0.1568617748594410, CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47)},
+ /* 84 */ {10, 0.1564377883420716, CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a)},
+ /* 85 */ {9, 0.1560210650222250, CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4)},
+ /* 86 */ {9, 0.1556113914024940, CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e)},
+ /* 87 */ {9, 0.1552085627701551, CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89)},
+ /* 88 */ {9, 0.1548123827357682, CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa)},
+ /* 89 */ {9, 0.1544226628011101, CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383)},
+ /* 90 */ {9, 0.1540392219542636, CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10)},
+ /* 91 */ {9, 0.1536618862898642, CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43)},
+ /* 92 */ {9, 0.1532904886526781, CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447)},
+ /* 93 */ {9, 0.1529248683028321, CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0)},
+ /* 94 */ {9, 0.1525648706011593, CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4)},
+ /* 95 */ {9, 0.1522103467132434, CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f)},
+ /* 96 */ {9, 0.1518611533308632, CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794)},
+ /* 97 */ {9, 0.1515171524096389, CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc)},
+ /* 98 */ {9, 0.1511782109217764, CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3)},
+ /* 99 */ {9, 0.1508442006228941, CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518)},
+ /* 100 */ {9, 0.1505149978319906, CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e)},
+ /* 101 */ {9, 0.1501904832236879, CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f)},
+ /* 102 */ {9, 0.1498705416319474, CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1)},
+ /* 103 */ {9, 0.1495550618645152, CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328)},
+ /* 104 */ {9, 0.1492439365274121, CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c)},
+ /* 105 */ {9, 0.1489370618588283, CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179)},
+ /* 106 */ {9, 0.1486343375718350, CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf)},
+ /* 107 */ {9, 0.1483356667053617, CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32)},
+ /* 108 */ {9, 0.1480409554829326, CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188)},
+ /* 109 */ {9, 0.1477501131786861, CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c)},
+ /* 110 */ {9, 0.1474630519902391, CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f)},
+ /* 111 */ {9, 0.1471796869179852, CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0)},
+ /* 112 */ {9, 0.1468999356504447, CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8)},
+ /* 113 */ {9, 0.1466237184553111, CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82)},
+ /* 114 */ {9, 0.1463509580758620, CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a)},
+ /* 115 */ {9, 0.1460815796324244, CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5)},
+ /* 116 */ {9, 0.1458155105286054, CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610)},
+ /* 117 */ {9, 0.1455526803620167, CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c)},
+ /* 118 */ {9, 0.1452930208392428, CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977)},
+ /* 119 */ {9, 0.1450364656948130, CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e)},
+ /* 120 */ {9, 0.1447829506139581, CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5)},
+ /* 121 */ {9, 0.1445324131589439, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
+ /* 122 */ {9, 0.1442847926987864, CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5)},
+ /* 123 */ {9, 0.1440400303421672, CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b)},
+ /* 124 */ {9, 0.1437980688733775, CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10)},
+ /* 125 */ {9, 0.1435588526911310, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
+ /* 126 */ {9, 0.1433223277500932, CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8)},
+ /* 127 */ {9, 0.1430884415049874, CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1)},
+ /* 128 */ {9, 0.1428571428571428, CNST_LIMB(0x7), CNST_LIMB(0x0)},
+ /* 129 */ {9, 0.1426283821033600, CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d)},
+ /* 130 */ {9, 0.1424021108869747, CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0)},
+ /* 131 */ {9, 0.1421782821510107, CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085)},
+ /* 132 */ {9, 0.1419568500933153, CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d)},
+ /* 133 */ {9, 0.1417377701235801, CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25)},
+ /* 134 */ {9, 0.1415209988221527, CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d)},
+ /* 135 */ {9, 0.1413064939005528, CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914)},
+ /* 136 */ {9, 0.1410942141636095, CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849)},
+ /* 137 */ {9, 0.1408841194731412, CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9)},
+ /* 138 */ {9, 0.1406761707131039, CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0)},
+ /* 139 */ {8, 0.1404703297561400, CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf)},
+ /* 140 */ {8, 0.1402665594314587, CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05)},
+ /* 141 */ {8, 0.1400648234939879, CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19)},
+ /* 142 */ {8, 0.1398650865947379, CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe)},
+ /* 143 */ {8, 0.1396673142523192, CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7)},
+ /* 144 */ {8, 0.1394714728255649, CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757)},
+ /* 145 */ {8, 0.1392775294872041, CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae)},
+ /* 146 */ {8, 0.1390854521985406, CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40)},
+ /* 147 */ {8, 0.1388952096850913, CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f)},
+ /* 148 */ {8, 0.1387067714131417, CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d)},
+ /* 149 */ {8, 0.1385201075671774, CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1)},
+ /* 150 */ {8, 0.1383351890281539, CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe)},
+ /* 151 */ {8, 0.1381519873525671, CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e)},
+ /* 152 */ {8, 0.1379704747522905, CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc)},
+ /* 153 */ {8, 0.1377906240751463, CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910)},
+ /* 154 */ {8, 0.1376124087861776, CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614)},
+ /* 155 */ {8, 0.1374358029495937, CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2)},
+ /* 156 */ {8, 0.1372607812113589, CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d)},
+ /* 157 */ {8, 0.1370873187823978, CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782)},
+ /* 158 */ {8, 0.1369153914223921, CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b)},
+ /* 159 */ {8, 0.1367449754241439, CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd)},
+ /* 160 */ {8, 0.1365760475984821, CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9)},
+ /* 161 */ {8, 0.1364085852596902, CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753)},
+ /* 162 */ {8, 0.1362425662114337, CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7)},
+ /* 163 */ {8, 0.1360779687331669, CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb)},
+ /* 164 */ {8, 0.1359147715670014, CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac)},
+ /* 165 */ {8, 0.1357529539050150, CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e)},
+ /* 166 */ {8, 0.1355924953769863, CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004)},
+ /* 167 */ {8, 0.1354333760385373, CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8)},
+ /* 168 */ {8, 0.1352755763596663, CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5)},
+ /* 169 */ {8, 0.1351190772136599, CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe)},
+ /* 170 */ {8, 0.1349638598663645, CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532)},
+ /* 171 */ {8, 0.1348099059658079, CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02)},
+ /* 172 */ {8, 0.1346571975321549, CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317)},
+ /* 173 */ {8, 0.1345057169479844, CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c)},
+ /* 174 */ {8, 0.1343554469488779, CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e)},
+ /* 175 */ {8, 0.1342063706143054, CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286)},
+ /* 176 */ {8, 0.1340584713587980, CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb)},
+ /* 177 */ {8, 0.1339117329233981, CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b)},
+ /* 178 */ {8, 0.1337661393673756, CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1)},
+ /* 179 */ {8, 0.1336216750601996, CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef)},
+ /* 180 */ {8, 0.1334783246737591, CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3)},
+ /* 181 */ {8, 0.1333360731748201, CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb)},
+ /* 182 */ {8, 0.1331949058177136, CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8)},
+ /* 183 */ {8, 0.1330548081372441, CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005)},
+ /* 184 */ {8, 0.1329157659418126, CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926)},
+ /* 185 */ {8, 0.1327777653067443, CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f)},
+ /* 186 */ {8, 0.1326407925678156, CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6)},
+ /* 187 */ {8, 0.1325048343149731, CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497)},
+ /* 188 */ {8, 0.1323698773862368, CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67)},
+ /* 189 */ {8, 0.1322359088617821, CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b)},
+ /* 190 */ {8, 0.1321029160581950, CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56)},
+ /* 191 */ {8, 0.1319708865228925, CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0)},
+ /* 192 */ {8, 0.1318398080287045, CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf)},
+ /* 193 */ {8, 0.1317096685686114, CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a)},
+ /* 194 */ {8, 0.1315804563506306, CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962)},
+ /* 195 */ {8, 0.1314521597928493, CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e)},
+ /* 196 */ {8, 0.1313247675185968, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
+ /* 197 */ {8, 0.1311982683517524, CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca)},
+ /* 198 */ {8, 0.1310726513121843, CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2)},
+ /* 199 */ {8, 0.1309479056113158, CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9)},
+ /* 200 */ {8, 0.1308240206478128, CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6)},
+ /* 201 */ {8, 0.1307009860033912, CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22)},
+ /* 202 */ {8, 0.1305787914387386, CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20)},
+ /* 203 */ {8, 0.1304574268895465, CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf)},
+ /* 204 */ {8, 0.1303368824626505, CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9)},
+ /* 205 */ {8, 0.1302171484322746, CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff)},
+ /* 206 */ {8, 0.1300982152363760, CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a)},
+ /* 207 */ {8, 0.1299800734730872, CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd)},
+ /* 208 */ {8, 0.1298627138972530, CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985)},
+ /* 209 */ {8, 0.1297461274170591, CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7)},
+ /* 210 */ {8, 0.1296303050907487, CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175)},
+ /* 211 */ {8, 0.1295152381234257, CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e)},
+ /* 212 */ {8, 0.1294009178639407, CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef)},
+ /* 213 */ {8, 0.1292873358018581, CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9)},
+ /* 214 */ {8, 0.1291744835645007, CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc)},
+ /* 215 */ {8, 0.1290623529140715, CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b)},
+ /* 216 */ {8, 0.1289509357448472, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 217 */ {8, 0.1288402240804449, CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2)},
+ /* 218 */ {8, 0.1287302100711567, CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049)},
+ /* 219 */ {8, 0.1286208859913518, CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada)},
+ /* 220 */ {8, 0.1285122442369443, CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933)},
+ /* 221 */ {8, 0.1284042773229231, CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d)},
+ /* 222 */ {8, 0.1282969778809442, CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b)},
+ /* 223 */ {8, 0.1281903386569819, CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4)},
+ /* 224 */ {8, 0.1280843525090381, CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053)},
+ /* 225 */ {8, 0.1279790124049077, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
+ /* 226 */ {8, 0.1278743114199984, CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9)},
+ /* 227 */ {8, 0.1277702427352035, CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e)},
+ /* 228 */ {8, 0.1276667996348261, CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b)},
+ /* 229 */ {8, 0.1275639755045533, CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1)},
+ /* 230 */ {8, 0.1274617638294791, CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762)},
+ /* 231 */ {8, 0.1273601581921741, CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555)},
+ /* 232 */ {8, 0.1272591522708010, CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe)},
+ /* 233 */ {8, 0.1271587398372755, CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f)},
+ /* 234 */ {8, 0.1270589147554692, CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3)},
+ /* 235 */ {8, 0.1269596709794558, CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c)},
+ /* 236 */ {8, 0.1268610025517973, CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773)},
+ /* 237 */ {8, 0.1267629036018709, CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5)},
+ /* 238 */ {8, 0.1266653683442337, CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f)},
+ /* 239 */ {8, 0.1265683910770258, CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c)},
+ /* 240 */ {8, 0.1264719661804097, CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529)},
+ /* 241 */ {8, 0.1263760881150453, CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1)},
+ /* 242 */ {8, 0.1262807514205999, CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2)},
+ /* 243 */ {8, 0.1261859507142915, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 244 */ {8, 0.1260916806894653, CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6)},
+ /* 245 */ {8, 0.1259979361142023, CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2)},
+ /* 246 */ {8, 0.1259047118299582, CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47)},
+ /* 247 */ {8, 0.1258120027502338, CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016)},
+ /* 248 */ {8, 0.1257198038592741, CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18)},
+ /* 249 */ {8, 0.1256281102107963, CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d)},
+ /* 250 */ {8, 0.1255369169267456, CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5)},
+ /* 251 */ {8, 0.1254462191960791, CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3)},
+ /* 252 */ {8, 0.1253560122735751, CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d)},
+ /* 253 */ {8, 0.1252662914786691, CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac)},
+ /* 254 */ {8, 0.1251770521943144, CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93)},
+ /* 255 */ {8, 0.1250882898658681, CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f)},
+};
+#endif
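+
+/* Reading an entry (summary comment, added for reference): for a
+   power-of-2 base, big_base holds log2(base) and big_base_inverted is
+   unused; for every other base, big_base is base^chars_per_limb, the
+   largest power of the base that fits in one limb, and
+   big_base_inverted is a precomputed (normalized) reciprocal of it,
+   used for division-free digit extraction.  chars_per_bit_exactly is
+   1/log2(base), used to size digit buffers.  */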
diff --git a/rts/gmp/mpn/ns32k/add_n.s b/rts/gmp/mpn/ns32k/add_n.s
new file mode 100644
index 0000000000..bd063d07d9
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/add_n.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ save [r3,r4,r5]
+ negd 28(sp),r3
+ movd r3,r0
+ lshd 2,r0
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0
+
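+# The size was negated into r3 above so that the scaled-index mode
+# r5[r3:d] walks the vectors from their starts while acbd adds 1 and
+# branches back while r3 is still nonzero; this keeps the whole loop
+# body to four instructions.
+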
+Loop: movd r5[r3:d],r0
+ addcd r4[r3:d],r0
+ movd r0,r2[r3:d]
+ acbd 1,r3,Loop
+
+ scsd r0 # r0 = cy.
+ restore [r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/addmul_1.s b/rts/gmp/mpn/ns32k/addmul_1.s
new file mode 100644
index 0000000000..df0dcdd4af
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/addmul_1.s
@@ -0,0 +1,48 @@
+# ns32000 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0
+ addd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/mul_1.s b/rts/gmp/mpn/ns32k/mul_1.s
new file mode 100644
index 0000000000..0a77efba29
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/mul_1.s
@@ -0,0 +1,47 @@
+# ns32000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ movd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/sub_n.s b/rts/gmp/mpn/ns32k/sub_n.s
new file mode 100644
index 0000000000..cd89f4fd3f
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/sub_n.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ save [r3,r4,r5]
+ negd 28(sp),r3
+ movd r3,r0
+ lshd 2,r0
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0
+
+Loop: movd r5[r3:d],r0
+ subcd r4[r3:d],r0
+ movd r0,r2[r3:d]
+ acbd 1,r3,Loop
+
+ scsd r0 # r0 = cy.
+ restore [r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/submul_1.s b/rts/gmp/mpn/ns32k/submul_1.s
new file mode 100644
index 0000000000..f811aedcf1
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/submul_1.s
@@ -0,0 +1,48 @@
+# ns32000 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_submul_1
+___gmpn_submul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0
+ subd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/pa64/README b/rts/gmp/mpn/pa64/README
new file mode 100644
index 0000000000..8d2976dabc
--- /dev/null
+++ b/rts/gmp/mpn/pa64/README
@@ -0,0 +1,38 @@
+This directory contains mpn functions for 64-bit PA-RISC 2.0.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The PA8000 has a multi-issue pipeline with large buffers for instructions
+awaiting pending results. Therefore, no latency scheduling is necessary
+(and might actually be harmful).
+
+Two 64-bit loads can be completed per cycle. One 64-bit store can be
+completed per cycle. A store cannot complete in the same cycle as a load.
+
+STATUS
+
+* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at
+ the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb
+ for add/subtract.
+
+* The multiplication functions run at 11 cycles/limb.  The cache bandwidth
+  allows 7.5 cycles/limb (see the accounting at the end of this file).
+  Perhaps it would be possible, using unrolling or better scheduling, to
+  get closer to the cache bandwidth limit.
+
+* xaddmul_1.S contains a quicker method for forming the 128-bit product.  It
+  uses somewhat fewer operations, and keeps the carry flag live across the
+  loop boundary.  But it seems hard to make it run more than 1/4 cycle
+  faster than the old code.  Perhaps we really ought to unroll this loop
+  by 2x?  2x should suffice, since register latency scheduling is never
+  needed, but the unrolling would hide the store-load latency.  Here is a
+  sketch:
+
+  1. A: multiply and store 64-bit partial products
+  2. B: sum 64-bit partial products into a 128-bit product
+  3. B: load 64-bit partial products to integer registers
+  4. B: multiply and store 64-bit partial products
+  5. A: sum 64-bit partial products into a 128-bit product
+  6. A: load 64-bit partial products to integer registers
+  7. goto 1
+
+  In practice, adjacent groups (1 and 2, 2 and 3, etc.) will be interleaved
+  for better instruction mix.
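+
+CYCLE-COUNT ACCOUNTING
+
+A rough derivation of the bandwidth figures above (our reading of the
+current loops, not a vendor number): add_n/sub_n touch two loads and one
+store per limb; at two loads/cycle and one store/cycle, with loads and
+stores unable to share a cycle, that is 1 + 1 = 2.0 cycles/limb.
+lshift/rshift touch one load and one store per limb: 0.5 + 1 = 1.5
+cycles/limb.  mul_1 routes every limb through the FPU: 1 fldd + 4 fstd
+of partial products + 4 ldd to reload them + 1 std of the result, i.e.
+5 loads and 5 stores, hence 5/2 + 5 = 7.5 cycles/limb.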
diff --git a/rts/gmp/mpn/pa64/add_n.s b/rts/gmp/mpn/pa64/add_n.s
new file mode 100644
index 0000000000..22ff19c184
--- /dev/null
+++ b/rts/gmp/mpn/pa64/add_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+; store sum in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_add_n,entry
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
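+; Each ldd/ldd/add,dc/std group below is four instructions (16 bytes),
+; and blr scales its index by 8 bytes, hence r28 = 2 * (-n & 7) above:
+; the computed branch enters the unrolled loop so that the first,
+; partial pass handles n mod 8 limbs (a full 8 when n is a multiple
+; of 8), with the pointer offsets computed above compensating.
+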
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ bve (%r2)
+ .exit
+ ldi 0,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/addmul_1.S b/rts/gmp/mpn/pa64/addmul_1.S
new file mode 100644
index 0000000000..b1885b432c
--- /dev/null
+++ b/rts/gmp/mpn/pa64/addmul_1.S
@@ -0,0 +1,167 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
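+; How the four xmpyu partial products recombine (reference sketch, in
+; C-like notation; hi32/lo32 denote the upper/lower 32 bits):
+;   lo = lo32(s2limb) * lo32(*sptr)    ; fr5R * fr4R
+;   m0 = lo32(s2limb) * hi32(*sptr)    ; fr5R * fr4L
+;   m1 = hi32(s2limb) * lo32(*sptr)    ; fr5L * fr4R
+;   hi = hi32(s2limb) * hi32(*sptr)    ; fr5L * fr4L
+;   t1 = hi32(lo) + m0 + m1            ; a carry here is worth 2^32 in hi
+;   low64  = (lo32(t1) << 32) + lo32(lo)
+;   high64 = hi + hi32(t1)             ; hi was bumped by 2^32 on carry
+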
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_addmul_1,entry
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
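+	; %r5 now holds 2^32; the add,l,*nuv / add,l pairs below add it
+	; to hi whenever the mid-product sum carries out of 64 bits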
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,rlimb,rlimb
+ add,dc t2,hi,cylimb
+ add t4,rlimb,t3
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/gmp-mparam.h b/rts/gmp/mpn/pa64/gmp-mparam.h
new file mode 100644
index 0000000000..847735b987
--- /dev/null
+++ b/rts/gmp/mpn/pa64/gmp-mparam.h
@@ -0,0 +1,65 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values were measured on a PA8000 using the system compiler,
+   version A.10.32.30.  Presumably the PA8200 and PA8500 have the same
+   timing characteristics, but GCC might give somewhat different
+   results. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 16
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 105
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 40
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 72
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 94
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 50
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 1
+#endif
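+
+/* Illustrative sketch (added for reference; not code from this tree):
+   the multiply routines consult these cutoffs along the lines of
+
+       if (n < KARATSUBA_MUL_THRESHOLD)
+         mpn_mul_basecase (wp, up, n, vp, n);
+       else if (n < TOOM3_MUL_THRESHOLD)
+         mpn_kara_mul_n (wp, up, vp, n, tspace);
+       else
+         mpn_toom3_mul_n (wp, up, vp, n, tspace);
+
+   so each value marks the operand size at which the next algorithm
+   starts to win on this CPU.  */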
diff --git a/rts/gmp/mpn/pa64/lshift.s b/rts/gmp/mpn/pa64/lshift.s
new file mode 100644
index 0000000000..994bc1c4d6
--- /dev/null
+++ b/rts/gmp/mpn/pa64/lshift.s
@@ -0,0 +1,103 @@
+; HP-PA 2.0 __gmpn_lshift -- Shift a limb vector left and store the
+; result in a second limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_lshift,entry
+__gmpn_lshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ shladd %r24,3,%r25,%r25
+ shladd %r24,3,%r26,%r26
+ subi 64,%r23,%r23
+ mtsar %r23
+ ldd -8(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r0,%r21,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ add %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ add %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd -16(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-8(%r26)
+L$7 ldd -24(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-16(%r26)
+L$6 ldd -32(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-24(%r26)
+L$5 ldd -40(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-32(%r26)
+L$4 ldd -48(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-40(%r26)
+L$3 ldd -56(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-48(%r26)
+L$2 ldd -64(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-56(%r26)
+L$1 ldd -72(%r25),%r21
+ ldo -64(%r25),%r25
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-64(%r26)
+ addib,> -8,%r24,L$loop
+ ldo -64(%r26),%r26
+
+L$end shrpd %r21,%r0,%sar,%r21
+ std %r21,-8(%r26)
+ bve (%r2)
+ .exit
+ extrd,u %r29,31,32,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/mul_1.S b/rts/gmp/mpn/pa64/mul_1.S
new file mode 100644
index 0000000000..ab310c1264
--- /dev/null
+++ b/rts/gmp/mpn/pa64/mul_1.S
@@ -0,0 +1,158 @@
+; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and
+; store the result in a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_mul_1,entry
+__gmpn_mul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+	extrd,u	lo,63,32,t4	; t4 = lo32(lo)
+	add,l	m0,t1,t1	; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t3
+ add,dc t2,hi,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/rshift.s b/rts/gmp/mpn/pa64/rshift.s
new file mode 100644
index 0000000000..f0730e2a91
--- /dev/null
+++ b/rts/gmp/mpn/pa64/rshift.s
@@ -0,0 +1,100 @@
+; HP-PA 2.0 __gmpn_rshift -- Shift a limb vector right and store the
+; result in a second limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_rshift,entry
+__gmpn_rshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ mtsar %r23
+ ldd 0(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r21,%r0,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ sub %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd 8(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,0(%r26)
+L$7 ldd 16(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,8(%r26)
+L$6 ldd 24(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,16(%r26)
+L$5 ldd 32(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,24(%r26)
+L$4 ldd 40(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,32(%r26)
+L$3 ldd 48(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,40(%r26)
+L$2 ldd 56(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,48(%r26)
+L$1 ldd 64(%r25),%r21
+ ldo 64(%r25),%r25
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,56(%r26)
+ addib,> -8,%r24,L$loop
+ ldo 64(%r26),%r26
+
+L$end shrpd %r0,%r21,%sar,%r21
+ std %r21,0(%r26)
+ bve (%r2)
+ .exit
+ extrd,u %r29,31,32,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/sub_n.s b/rts/gmp/mpn/pa64/sub_n.s
new file mode 100644
index 0000000000..dda1f54b34
--- /dev/null
+++ b/rts/gmp/mpn/pa64/sub_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+; and store difference in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_sub_n,entry
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ subi 1,%r29,%r29
+ bve (%r2)
+ .exit
+ ldi 0,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/submul_1.S b/rts/gmp/mpn/pa64/submul_1.S
new file mode 100644
index 0000000000..27666b99df
--- /dev/null
+++ b/rts/gmp/mpn/pa64/submul_1.S
@@ -0,0 +1,170 @@
+; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_submul_1,entry
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t4
+ add,dc t2,hi,cylimb
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
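
Stripped of the xmpyu scheduling and the carry regeneration trick (the add t4,t3,%r0 above recomputes rlimb purely to turn the borrow back into a carry flag), the loop computes the usual submul_1 recurrence. A hedged C sketch; unsigned __int128 is a GCC extension standing in for the 64x64->128 multiply:

#include <stdint.h>

typedef uint64_t limb;

/* {r,n} -= {s,n} * v; returns the high limb of the final product plus
   accumulated borrows -- cylimb in the assembly.  Illustrative only. */
static limb submul_1_ref (limb *r, const limb *s, long n, limb v)
{
  limb cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) s[i] * v + cy;
      limb lo = (limb) t;
      cy = (limb) (t >> 64);
      cy += r[i] < lo;          /* borrow out of the subtraction */
      r[i] -= lo;
    }
  return cy;
}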
diff --git a/rts/gmp/mpn/pa64/udiv_qrnnd.c b/rts/gmp/mpn/pa64/udiv_qrnnd.c
new file mode 100644
index 0000000000..1c9fe084db
--- /dev/null
+++ b/rts/gmp/mpn/pa64/udiv_qrnnd.c
@@ -0,0 +1,111 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define TWO64 18446744073709551616.0
+
+mp_limb_t
+#if __STDC__
+__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r)
+#else
+__MPN(udiv_qrnnd) (n1, n0, d, r)
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d;
+ mp_limb_t *r;
+#endif
+{
+ mp_limb_t q1, q2, q;
+ mp_limb_t p1, p0;
+ double di, dq;
+
+ di = 1.0 / d;
+
+ /* Generate upper 53 bits of quotient. Be careful here; the `double'
+ quotient may be rounded to 2^64 which we cannot safely convert back
+ to a 64-bit integer. */
+ dq = (TWO64 * (double) n1 + (double) n0) * di;
+ if (dq >= TWO64)
+ q1 = 0xfffffffffffff800LL;
+ else
+ q1 = (mp_limb_t) dq;
+
+ /* Multiply back in order to compare the product to the dividend. */
+ umul_ppmm (p1, p0, q1, d);
+
+  /* Was the 53-bit quotient greater than our sought quotient?  Test the
+ sign of the partial remainder to find out. */
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ /* 53-bit quotient too large. Partial remainder is negative.
+ Compute the absolute value of the remainder in n1,,n0. */
+ n1 = p1 - (n1 + (p0 < n0));
+ n0 = p0 - n0;
+
+ /* Now use the partial remainder as new dividend to compute more bits of
+ quotient. This is an adjustment for the one we got previously. */
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 - q2;
+ if (n1 < p1 || (n1 == p1 && n0 <= p0))
+ {
+ n0 = p0 - n0;
+ }
+ else
+ {
+ n0 = p0 - n0;
+ n0 += d;
+ q--;
+ }
+ }
+ else
+ {
+ n1 = n1 - (p1 + (n0 < p0));
+ n0 = n0 - p0;
+
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 + q2;
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ n0 = n0 - p0;
+ n0 += d;
+ q--;
+ }
+ else
+ {
+ n0 = n0 - p0;
+ if (n0 >= d)
+ {
+ n0 -= d;
+ q++;
+ }
+ }
+ }
+
+ *r = n0;
+ return q;
+}
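
A hedged caller-side sketch of the contract above, not part of the GMP sources: after q = __MPN(udiv_qrnnd) (n1, n0, d, &r), the identity n1*2^64 + n0 == q*d + r holds with r < d, provided n1 < d so the quotient fits in one limb. The check below emulates the division with unsigned __int128 (a GCC extension); the test values are arbitrary.

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  uint64_t n1 = 3, n0 = 0x123456789abcdef0ULL;
  uint64_t d = 0x8000000000000001ULL;            /* note n1 < d */
  unsigned __int128 n = ((unsigned __int128) n1 << 64) | n0;
  uint64_t q = (uint64_t) (n / d);  /* what udiv_qrnnd should return  */
  uint64_t r = (uint64_t) (n % d);  /* what it should store through *r */
  printf ("q = 0x%llx, r = 0x%llx\n",
          (unsigned long long) q, (unsigned long long) r);
  return 0;
}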
diff --git a/rts/gmp/mpn/pa64/umul_ppmm.S b/rts/gmp/mpn/pa64/umul_ppmm.S
new file mode 100644
index 0000000000..ceff2d752f
--- /dev/null
+++ b/rts/gmp/mpn/pa64/umul_ppmm.S
@@ -0,0 +1,74 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+#define p0 %r28
+#define p1 %r29
+#define t32 %r19
+#define t0 %r20
+#define t1 %r21
+#define x %r22
+#define m0 %r23
+#define m1 %r24
+ .level 2.0n
+ .code
+ .export __gmpn_umul_ppmm,entry
+__gmpn_umul_ppmm
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ ldo 128(%r30),%r30
+ depd %r25,31,32,%r26
+ std %r26,-64(%r30)
+ depd %r23,31,32,%r24
+ std %r24,-56(%r30)
+
+ ldw -180(%r30),%r31
+
+ fldd -64(%r30),%fr4
+ fldd -56(%r30),%fr5
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+
+ depdi,z 1,31,1,t32 ; t32 = 2^32
+
+ ldd -128(%r30),p0 ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),p1 ; hi = high 64 bit of product
+
+ add,l,*nuv m0,m1,x ; x = m1+m0
+ add,l t32,p1,p1 ; propagate carry to mid of p1
+ depd,z x,31,32,t0 ; lo32(m1+m0)
+ add t0,p0,p0
+ extrd,u x,31,32,t1 ; hi32(m1+m0)
+ add,dc t1,p1,p1
+
+ std p0,0(%r31) ; store low half of product
+ extrd,u p1,31,32,%r28 ; return high half of product
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
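
The four stored xmpyu results above are the 32x32->64 partial products of a 64x64->128 multiply; the integer tail folds the two mid products together, with t32 = 2^32 repairing the possible wrap of m0+m1 (the add,l,*nuv / add,l t32 pair). A portable C sketch of that combination, with names of my choosing:

#include <stdint.h>

/* 64x64 -> 128 from four 32x32->64 partial products, mirroring the
   xmpyu/add sequence above.  Illustrative only. */
static void umul_ppmm_sketch (uint64_t *p1, uint64_t *p0,
                              uint64_t a, uint64_t b)
{
  uint64_t al = (uint32_t) a, ah = a >> 32;
  uint64_t bl = (uint32_t) b, bh = b >> 32;
  uint64_t lo = al * bl;              /* fr6: low product  */
  uint64_t m0 = al * bh;              /* fr7: mid0 product */
  uint64_t m1 = ah * bl;              /* fr8: mid1 product */
  uint64_t hi = ah * bh;              /* fr9: high product */
  uint64_t x = m0 + m1;               /* can wrap past 2^64...    */
  if (x < m0)
    hi += (uint64_t) 1 << 32;         /* ...which has weight 2^96 */
  *p0 = lo + (x << 32);
  *p1 = hi + (x >> 32) + (*p0 < lo);  /* carry from the low half  */
}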
diff --git a/rts/gmp/mpn/pa64w/README b/rts/gmp/mpn/pa64w/README
new file mode 100644
index 0000000000..cf590a7b98
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/README
@@ -0,0 +1,2 @@
+This directory contains mpn functions for 64-bit PA-RISC 2.0
+using 64-bit pointers (2.0W).
diff --git a/rts/gmp/mpn/pa64w/add_n.s b/rts/gmp/mpn/pa64w/add_n.s
new file mode 100644
index 0000000000..1bb9e8fbc7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/add_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+; store sum in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_add_n,entry
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
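
The add,dc chain above keeps the inter-limb carry in the processor status word across the whole unrolled body; as a hedged plain C reference (my phrasing), the operation and its return value are:

#include <stdint.h>

typedef uint64_t limb;

/* r[i] = s1[i] + s2[i] + carry; returns the final carry (0 or 1),
   which the assembly returns in %r28.  Illustrative only. */
static limb add_n_ref (limb *r, const limb *s1, const limb *s2, long n)
{
  limb cy = 0;
  for (long i = 0; i < n; i++)
    {
      limb a = s1[i], s = a + s2[i] + cy;
      cy = cy ? (s <= a) : (s < a);   /* carry out of this limb */
      r[i] = s;
    }
  return cy;
}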
diff --git a/rts/gmp/mpn/pa64w/addmul_1.S b/rts/gmp/mpn/pa64w/addmul_1.S
new file mode 100644
index 0000000000..4799f90fc5
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/addmul_1.S
@@ -0,0 +1,168 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0w
+ .code
+ .export __gmpn_addmul_1,entry
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ std s2limb,-56(%r30)
+ fldd -56(%r30),%fr5
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,rlimb,rlimb
+ add,dc t2,hi,cylimb
+ add t4,rlimb,t3
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ copy cylimb,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
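
Underneath the software pipelining, the loop computes the standard addmul_1 recurrence. A hedged sketch; unsigned __int128 is a GCC extension standing in for the xmpyu-based 64x64->128 multiply:

#include <stdint.h>

typedef uint64_t limb;

/* {r,n} += {s,n} * v; returns the high limb of the last product plus
   accumulated carries -- cylimb in the assembly.  Illustrative only. */
static limb addmul_1_ref (limb *r, const limb *s, long n, limb v)
{
  limb cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) s[i] * v + r[i] + cy;
      r[i] = (limb) t;
      cy = (limb) (t >> 64);
    }
  return cy;
}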
diff --git a/rts/gmp/mpn/pa64w/gmp-mparam.h b/rts/gmp/mpn/pa64w/gmp-mparam.h
new file mode 100644
index 0000000000..ee5a0a3ab7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/gmp-mparam.h
@@ -0,0 +1,65 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values were measured on a PA8500 using the system compiler version
+ A.11.01.02. Presumably the PA8000 and PA8200 have the same timing
+   characteristics, but GCC might give somewhat different results.  */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 18
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 105
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 46
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 83
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 58
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 134
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 56
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 26
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 1
+#endif
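
These thresholds are consumed by GMP's multiplication dispatch; schematically it amounts to something like this hedged sketch (not GMP's actual code, which handles unequal operand sizes and more cases):

#include <stddef.h>

#define KARATSUBA_MUL_THRESHOLD 18
#define TOOM3_MUL_THRESHOLD 105

/* Pick a multiplication algorithm from the operand size in limbs. */
static const char *mul_algorithm (size_t n)
{
  if (n < KARATSUBA_MUL_THRESHOLD)
    return "basecase";
  if (n < TOOM3_MUL_THRESHOLD)
    return "karatsuba";
  return "toom3";
}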
diff --git a/rts/gmp/mpn/pa64w/lshift.s b/rts/gmp/mpn/pa64w/lshift.s
new file mode 100644
index 0000000000..84f925a105
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/lshift.s
@@ -0,0 +1,103 @@
+; HP-PA 2.0 __gmpn_lshift -- Shift a limb vector left.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_lshift,entry
+__gmpn_lshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ shladd %r24,3,%r25,%r25
+ shladd %r24,3,%r26,%r26
+ subi 64,%r23,%r23
+ mtsar %r23
+ ldd -8(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r0,%r21,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ add %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ add %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd -16(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-8(%r26)
+L$7 ldd -24(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-16(%r26)
+L$6 ldd -32(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-24(%r26)
+L$5 ldd -40(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-32(%r26)
+L$4 ldd -48(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-40(%r26)
+L$3 ldd -56(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-48(%r26)
+L$2 ldd -64(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-56(%r26)
+L$1 ldd -72(%r25),%r21
+ ldo -64(%r25),%r25
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-64(%r26)
+ addib,> -8,%r24,L$loop
+ ldo -64(%r26),%r26
+
+L$end shrpd %r21,%r0,%sar,%r21
+ std %r21,-8(%r26)
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
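
The loop above runs from the most significant limb downwards so that each shrpd can pair two adjacent source limbs; the first shrpd computes the bits shifted out of the top limb, which become the return value. A hedged C reference for the contract, assuming 0 < cnt < 64:

#include <stdint.h>

typedef uint64_t limb;

/* Shift {s,n} left by cnt bits into {r,n}; returns the bits shifted
   out of the most significant limb (%r29 above).  Illustrative only. */
static limb lshift_ref (limb *r, const limb *s, long n, unsigned cnt)
{
  limb out = s[n - 1] >> (64 - cnt);
  for (long i = n - 1; i > 0; i--)        /* high to low, as above */
    r[i] = (s[i] << cnt) | (s[i - 1] >> (64 - cnt));
  r[0] = s[0] << cnt;
  return out;
}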
diff --git a/rts/gmp/mpn/pa64w/mul_1.S b/rts/gmp/mpn/pa64w/mul_1.S
new file mode 100644
index 0000000000..48f13fbd1b
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/mul_1.S
@@ -0,0 +1,159 @@
+; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and
+; store the result in a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0w
+ .code
+ .export __gmpn_mul_1,entry
+__gmpn_mul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ std s2limb,-56(%r30)
+ fldd -56(%r30),%fr5
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+	extrd,u	lo,63,32,t4	; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t3
+ add,dc t2,hi,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ copy cylimb,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64w/rshift.s b/rts/gmp/mpn/pa64w/rshift.s
new file mode 100644
index 0000000000..2517cb1f87
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/rshift.s
@@ -0,0 +1,100 @@
+; HP-PA 2.0 __gmpn_rshift -- Shift a limb vector right.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_rshift,entry
+__gmpn_rshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ mtsar %r23
+ ldd 0(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r21,%r0,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ sub %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd 8(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,0(%r26)
+L$7 ldd 16(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,8(%r26)
+L$6 ldd 24(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,16(%r26)
+L$5 ldd 32(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,24(%r26)
+L$4 ldd 40(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,32(%r26)
+L$3 ldd 48(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,40(%r26)
+L$2 ldd 56(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,48(%r26)
+L$1 ldd 64(%r25),%r21
+ ldo 64(%r25),%r25
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,56(%r26)
+ addib,> -8,%r24,L$loop
+ ldo 64(%r26),%r26
+
+L$end shrpd %r0,%r21,%sar,%r21
+ std %r21,0(%r26)
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64w/sub_n.s b/rts/gmp/mpn/pa64w/sub_n.s
new file mode 100644
index 0000000000..ad01e24aa7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/sub_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+; and store difference in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_sub_n,entry
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ subi 1,%r29,%r29
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64w/submul_1.S b/rts/gmp/mpn/pa64w/submul_1.S
new file mode 100644
index 0000000000..294f6239b2
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/submul_1.S
@@ -0,0 +1,171 @@
+; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0w
+ .code
+ .export __gmpn_submul_1,entry
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ std s2limb,-56(%r30)
+ fldd -56(%r30),%fr5
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t4
+ add,dc t2,hi,cylimb
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ copy cylimb,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64w/udiv_qrnnd.c b/rts/gmp/mpn/pa64w/udiv_qrnnd.c
new file mode 100644
index 0000000000..1852913000
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/udiv_qrnnd.c
@@ -0,0 +1,117 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define TWO64 18446744073709551616.0
+#define TWO63 9223372036854775808.0
+
+mp_limb_t
+#if __STDC__
+__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r)
+#else
+__MPN(udiv_qrnnd) (n1, n0, d, r)
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d;
+ mp_limb_t *r;
+#endif
+{
+ mp_limb_t q1, q2, q;
+ mp_limb_t p1, p0;
+ double di, dq;
+
+ di = 1.0 / d;
+
+ /* Generate upper 53 bits of quotient. Be careful here; the `double'
+ quotient may be rounded to 2^64 which we cannot safely convert back
+ to a 64-bit integer. */
+ dq = (TWO64 * (double) n1 + (double) n0) * di;
+ if (dq >= TWO64)
+ q1 = 0xfffffffffffff800L;
+#ifndef __GNUC__
+ /* Work around HP compiler bug. */
+ else if (dq > TWO63)
+ q1 = (mp_limb_t) (dq - TWO63) + 0x8000000000000000L;
+#endif
+ else
+ q1 = (mp_limb_t) dq;
+
+ /* Multiply back in order to compare the product to the dividend. */
+ umul_ppmm (p1, p0, q1, d);
+
+  /* Was the 53-bit quotient greater than our sought quotient?  Test the
+ sign of the partial remainder to find out. */
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ /* 53-bit quotient too large. Partial remainder is negative.
+ Compute the absolute value of the remainder in n1,,n0. */
+ n1 = p1 - (n1 + (p0 < n0));
+ n0 = p0 - n0;
+
+ /* Now use the partial remainder as new dividend to compute more bits of
+ quotient. This is an adjustment for the one we got previously. */
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 - q2;
+ if (n1 < p1 || (n1 == p1 && n0 <= p0))
+ {
+ n0 = p0 - n0;
+ }
+ else
+ {
+ n0 = p0 - n0;
+ n0 += d;
+ q--;
+ }
+ }
+ else
+ {
+ n1 = n1 - (p1 + (n0 < p0));
+ n0 = n0 - p0;
+
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 + q2;
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ n0 = n0 - p0;
+ n0 += d;
+ q--;
+ }
+ else
+ {
+ n0 = n0 - p0;
+ if (n0 >= d)
+ {
+ n0 -= d;
+ q++;
+ }
+ }
+ }
+
+ *r = n0;
+ return q;
+}
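
The #ifndef __GNUC__ branch above works around an HP compiler bug in converting a double at or above 2^63 to a 64-bit integer; isolated as a hedged helper (the function name is mine):

#include <stdint.h>

#define TWO63 9223372036854775808.0

/* Convert a double in [0, 2^64) to a limb without a direct
   conversion above 2^63, mirroring the workaround above. */
static uint64_t dbl_to_limb (double dq)
{
  if (dq >= TWO63)
    return (uint64_t) (dq - TWO63) + 0x8000000000000000ULL;
  return (uint64_t) dq;
}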
diff --git a/rts/gmp/mpn/pa64w/umul_ppmm.S b/rts/gmp/mpn/pa64w/umul_ppmm.S
new file mode 100644
index 0000000000..d9fb92be8c
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/umul_ppmm.S
@@ -0,0 +1,72 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+#define p0 %r28
+#define p1 %r29
+#define t32 %r19
+#define t0 %r20
+#define t1 %r21
+#define x %r22
+#define m0 %r23
+#define m1 %r24
+ .level 2.0w
+ .code
+ .export __gmpn_umul_ppmm,entry
+__gmpn_umul_ppmm
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ ldo 128(%r30),%r30
+ std %r26,-64(%r30)
+ std %r25,-56(%r30)
+
+ copy %r24,%r31
+
+ fldd -64(%r30),%fr4
+ fldd -56(%r30),%fr5
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+
+ depdi,z 1,31,1,t32 ; t32 = 2^32
+
+ ldd -128(%r30),p0 ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),p1 ; hi = high 64 bit of product
+
+ add,l,*nuv m0,m1,x ; x = m1+m0
+ add,l t32,p1,p1 ; propagate carry to mid of p1
+ depd,z x,31,32,t0 ; lo32(m1+m0)
+ add t0,p0,p0
+ extrd,u x,31,32,t1 ; hi32(m1+m0)
+ add,dc t1,p1,p1
+
+ std p0,0(%r31) ; store low half of product
+ copy p1,%r28 ; return high half of product
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/power/add_n.s b/rts/gmp/mpn/power/add_n.s
new file mode 100644
index 0000000000..0f9f48f1cc
--- /dev/null
+++ b/rts/gmp/mpn/power/add_n.s
@@ -0,0 +1,79 @@
+# IBM POWER __gmpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+# Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+ .toc
+ .globl __gmpn_add_n
+ .globl .__gmpn_add_n
+ .csect __gmpn_add_n[DS]
+__gmpn_add_n:
+ .long .__gmpn_add_n, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_add_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ a 7,0,8 # add least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Add the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bc 4,6,L1 # bne cr1,L1 (misassembled by gas)
+ st 7,4(3)
+ aze 3,10 # use the fact that r10 is zero...
+ br # return
+
+# We added least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ ae 7,0,8 # add limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ ae 11,9,10 # add previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ ae 7,0,8 # add previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: ae 11,9,10 # add limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ lil 3,0 # load cy into ...
+ aze 3,3 # ... return value register
+ br
diff --git a/rts/gmp/mpn/power/addmul_1.s b/rts/gmp/mpn/power/addmul_1.s
new file mode 100644
index 0000000000..8ecc651579
--- /dev/null
+++ b/rts/gmp/mpn/power/addmul_1.s
@@ -0,0 +1,122 @@
+# IBM POWER __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural
+# srai+and approach doesn't work).  Since the POWER architecture has a
+# branch unit we can branch in zero cycles, so that's how we perform the
+# additions.
+
+ .toc
+ .globl __gmpn_addmul_1
+ .globl .__gmpn_addmul_1
+ .csect __gmpn_addmul_1[DS]
+__gmpn_addmul_1:
+ .long .__gmpn_addmul_1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_addmul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 8
+ cax 9,9,7
+ l 7,4(3)
+ a 8,8,7 # add res_limb
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10
+ l 7,4(3)
+ aze 9,9
+ a 8,8,7
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 8,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 8,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
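
The compensation scheme described in this file's header comment (and repeated in mul_1.s below) can be checked with a small self-contained C sketch; the function name and the test harness are mine, not GMP's. Interpreting a 32-bit word as signed changes its value by 2^32 exactly when its top bit is set, which yields the two conditional additions:

#include <stdio.h>
#include <stdint.h>

/* Unsigned 32x32->64 high word built from a signed multiply plus the
   compensations described in the comment block above. */
static uint32_t umulh_via_smulh (uint32_t x, uint32_t y)
{
  int64_t sp = (int64_t) (int32_t) x * (int32_t) y;  /* signed product */
  uint32_t hi = (uint32_t) ((uint64_t) sp >> 32);    /* its high word  */
  if ((int32_t) x < 0)         /* multiplier MSB set: add multiplicand */
    hi += y;
  if ((int32_t) y < 0)         /* multiplicand MSB set: add multiplier */
    hi += x;
  return hi;
}

int main (void)
{
  uint32_t x = 0xdeadbeef, y = 0xfeedface;
  uint32_t ref = (uint32_t) (((uint64_t) x * y) >> 32);
  printf ("%s\n", umulh_via_smulh (x, y) == ref ? "ok" : "mismatch");
  return 0;
}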
diff --git a/rts/gmp/mpn/power/lshift.s b/rts/gmp/mpn/power/lshift.s
new file mode 100644
index 0000000000..ab71fb7727
--- /dev/null
+++ b/rts/gmp/mpn/power/lshift.s
@@ -0,0 +1,56 @@
+# IBM POWER __gmpn_lshift -- Shift a limb vector left.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+
+ .toc
+ .globl __gmpn_lshift
+ .globl .__gmpn_lshift
+ .csect __gmpn_lshift[DS]
+__gmpn_lshift:
+ .long .__gmpn_lshift, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_lshift:
+ sli 0,5,2
+ cax 9,3,0
+ cax 4,4,0
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ lu 0,-4(4) # read most significant limb
+ sre 3,0,8 # compute carry out limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,-4(4) # read 2:nd most significant limb
+ sreq 7,0,8 # compute most significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,-4(4) # load next lower limb
+ stu 7,-4(9) # store previous result during read latency
+ sreq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,-4(9) # store 2:nd least significant limb
+Lend2: sle 7,0,6 # compute least significant limb
+	st	7,-4(9)		# store it
+ br
diff --git a/rts/gmp/mpn/power/mul_1.s b/rts/gmp/mpn/power/mul_1.s
new file mode 100644
index 0000000000..4e08ade583
--- /dev/null
+++ b/rts/gmp/mpn/power/mul_1.s
@@ -0,0 +1,109 @@
+# IBM POWER __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural
+# srai+and approach doesn't work).  Since the POWER architecture has a
+# branch unit we can branch in zero cycles, so that's how we perform the
+# additions.
+
+ .toc
+ .globl __gmpn_mul_1
+ .globl .__gmpn_mul_1
+ .csect __gmpn_mul_1[DS]
+__gmpn_mul_1:
+ .long .__gmpn_mul_1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_mul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 8
+ ai 0,0,0 # reset carry
+ cax 9,9,7
+ blt Lneg
+Lpos: bdz Lend
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ cax 10,10,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,9
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ cax 9,9,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,10
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
diff --git a/rts/gmp/mpn/power/rshift.s b/rts/gmp/mpn/power/rshift.s
new file mode 100644
index 0000000000..65b3945f8a
--- /dev/null
+++ b/rts/gmp/mpn/power/rshift.s
@@ -0,0 +1,54 @@
+# IBM POWER __gmpn_rshift -- Shift a limb vector right.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+
+ .toc
+ .globl __gmpn_rshift
+ .globl .__gmpn_rshift
+ .csect __gmpn_rshift[DS]
+__gmpn_rshift:
+ .long .__gmpn_rshift, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_rshift:
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ l 0,0(4) # read least significant limb
+ ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s
+ sle 3,0,8 # compute carry limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,4(4) # read 2:nd least significant limb
+ sleq 7,0,8 # compute least significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,4(4) # load next higher limb
+ stu 7,4(9) # store previous result during read latency
+ sleq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,4(9) # store 2:nd most significant limb
+Lend2: sre 7,0,6 # compute most significant limb
+	st	7,4(9)		# store it
+ br
diff --git a/rts/gmp/mpn/power/sdiv.s b/rts/gmp/mpn/power/sdiv.s
new file mode 100644
index 0000000000..81da622fbc
--- /dev/null
+++ b/rts/gmp/mpn/power/sdiv.s
@@ -0,0 +1,34 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+ .toc
+ .globl __sdiv_qrnnd
+ .globl .__sdiv_qrnnd
+ .csect __sdiv_qrnnd[DS]
+__sdiv_qrnnd:
+ .long .__sdiv_qrnnd, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__sdiv_qrnnd:
+ mtmq 5
+ div 0,4,6
+ mfmq 9
+ st 9,0(3)
+ mr 3,0
+ br
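
In C terms, __sdiv_qrnnd divides a two-limb value by a single limb using the
POWER `div' instruction, which leaves the quotient in a GPR and the remainder
in MQ. A hedged sketch of the contract (names are illustrative; the exact
remainder rounding follows the hardware div, which this approximates with C's
truncating division):

    #include <stdint.h>

    /* Divide the two-limb value (n1:n0) by d; store the remainder through
       rem_ptr and return the quotient.  Signed, like the POWER `div'. */
    int32_t sdiv_qrnnd_sketch (int32_t *rem_ptr, int32_t n1, uint32_t n0,
                               int32_t d)
    {
      int64_t n = (int64_t) (((uint64_t) (uint32_t) n1 << 32) | n0);
      *rem_ptr = (int32_t) (n % d);   /* mfmq 9 / st 9,0(3) above */
      return (int32_t) (n / d);       /* div result, returned in r3 */
    }
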
diff --git a/rts/gmp/mpn/power/sub_n.s b/rts/gmp/mpn/power/sub_n.s
new file mode 100644
index 0000000000..aa09cf5bc1
--- /dev/null
+++ b/rts/gmp/mpn/power/sub_n.s
@@ -0,0 +1,80 @@
+# IBM POWER __gmpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+# Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+ .toc
+ .globl __gmpn_sub_n
+ .globl .__gmpn_sub_n
+ .csect __gmpn_sub_n[DS]
+__gmpn_sub_n:
+ .long .__gmpn_sub_n, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_sub_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ sf 7,0,8 # subtract least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs.  Subtract the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bc 4,6,L1 # bne cr1,L1 (misassembled by gas)
+ st 7,4(3)
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br # return
+
+# We subtracted the least significant limbs.  Now reload the next limbs to enter the loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ sfe 7,0,8 # subtract limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ sfe 11,10,9 # subtract previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ sfe 7,0,8 # subtract previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: sfe 11,10,9 # subtract limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br
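
The borrow chain that sfe threads through the unrolled loop above is the
usual multi-precision subtraction recurrence. A portable C sketch under the
same contract (illustrative names, 32-bit limbs assumed):

    #include <stdint.h>

    /* {rp, n} = {s1p, n} - {s2p, n}; return 1 on final borrow, else 0. */
    uint32_t sub_n_sketch (uint32_t *rp, const uint32_t *s1p,
                           const uint32_t *s2p, long n)
    {
      uint32_t borrow = 0;
      for (long i = 0; i < n; i++)
        {
          uint32_t a = s1p[i], b = s2p[i];
          rp[i] = a - b - borrow;
          /* borrow out iff b plus the incoming borrow exceeds a */
          borrow = (a < b) | ((a == b) & borrow);
        }
      return borrow;
    }
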
diff --git a/rts/gmp/mpn/power/submul_1.s b/rts/gmp/mpn/power/submul_1.s
new file mode 100644
index 0000000000..bc01b7c95d
--- /dev/null
+++ b/rts/gmp/mpn/power/submul_1.s
@@ -0,0 +1,127 @@
+# IBM POWER __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set.  We need to preserve the carry flag between
+# iterations, so we have to compute the compensation carefully (the natural
+# srai+and sequence doesn't work).  Since the POWER architecture has a branch
+# unit that can branch in zero cycles, that's how we perform the additions.
+
+ .toc
+ .globl __gmpn_submul_1
+ .globl .__gmpn_submul_1
+ .csect __gmpn_submul_1[DS]
+__gmpn_submul_1:
+ .long .__gmpn_submul_1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_submul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 11
+ cax 9,9,7
+ l 7,4(3)
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 11,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 11,0,10
+ l 7,4(3)
+ aze 9,9
+ sf 8,11,7
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 11,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 11,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
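
The compensation scheme described in the header comment can be checked
against a few lines of C. This sketch recovers the unsigned 32x32->64 product
from a signed multiply, which is also what umul.s below does branch-free with
srai/and (illustrative function name; not part of the library):

    #include <stdint.h>

    uint64_t umul_via_signed_mul (uint32_t a, uint32_t b)
    {
      int64_t sprod = (int64_t) (int32_t) a * (int32_t) b; /* the `mul' result */
      uint32_t hi = (uint32_t) ((uint64_t) sprod >> 32);
      uint32_t lo = (uint32_t) sprod;   /* low word needs no fixup */
      if ((int32_t) a < 0)
        hi += b;                        /* multiplicand MSB set: add multiplier */
      if ((int32_t) b < 0)
        hi += a;                        /* multiplier MSB set: add multiplicand */
      return ((uint64_t) hi << 32) | lo;
    }
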
diff --git a/rts/gmp/mpn/power/umul.s b/rts/gmp/mpn/power/umul.s
new file mode 100644
index 0000000000..8c77496380
--- /dev/null
+++ b/rts/gmp/mpn/power/umul.s
@@ -0,0 +1,38 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+ .toc
+ .globl __umul_ppmm
+ .globl .__umul_ppmm
+ .csect __umul_ppmm[DS]
+__umul_ppmm:
+ .long .__umul_ppmm, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__umul_ppmm:
+ mul 9,4,5
+ srai 0,4,31
+ and 0,0,5
+ srai 5,5,31
+ and 5,5,4
+ cax 0,0,5
+ mfmq 11
+ st 11,0(3)
+ cax 3,9,0
+ br
diff --git a/rts/gmp/mpn/powerpc32/add_n.asm b/rts/gmp/mpn/powerpc32/add_n.asm
new file mode 100644
index 0000000000..81ed04b162
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/add_n.asm
@@ -0,0 +1,61 @@
+dnl PowerPC-32 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl s2_ptr r5
+dnl size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ mtctr r6 C copy size into CTR
+ addic r0,r0,0 C clear cy
+ lwz r8,0(r4) C load least significant s1 limb
+ lwz r0,0(r5) C load least significant s2 limb
+ addi r3,r3,-4 C offset res_ptr, it's updated before it's used
+ bdz .Lend C If done, skip loop
+.Loop: lwz r9,4(r4) C load s1 limb
+ lwz r10,4(r5) C load s2 limb
+ adde r7,r0,r8 C add limbs with cy, set cy
+ stw r7,4(r3) C store result limb
+ bdz .Lexit C decrement CTR and exit if done
+ lwzu r8,8(r4) C load s1 limb and update s1_ptr
+ lwzu r0,8(r5) C load s2 limb and update s2_ptr
+ adde r7,r10,r9 C add limbs with cy, set cy
+ stwu r7,8(r3) C store result limb and update res_ptr
+ bdnz .Loop C decrement CTR and loop back
+
+.Lend: adde r7,r0,r8
+ stw r7,4(r3) C store ultimate result limb
+ li r3,0 C load cy into ...
+ addze r3,r3 C ... return value register
+ blr
+.Lexit: adde r7,r10,r9
+ stw r7,8(r3)
+ li r3,0 C load cy into ...
+ addze r3,r3 C ... return value register
+ blr
+EPILOGUE(mpn_add_n)
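
From the caller's side, the contract of this routine (and of its POWER and
64-bit siblings) is simple: add two little-endian limb vectors of equal
length and return the carry out of the top. A usage sketch with an
illustrative prototype (the real one uses GMP's mp_limb_t/mp_size_t):

    #include <stdint.h>

    extern uint32_t mpn_add_n (uint32_t *rp, const uint32_t *s1p,
                               const uint32_t *s2p, long n);

    void example (void)
    {
      uint32_t a[4] = { 0xffffffff, 0xffffffff, 0, 0 };  /* 2^64 - 1 */
      uint32_t b[4] = { 1, 0, 0, 0 };                    /* 1 */
      uint32_t r[4];
      uint32_t cy = mpn_add_n (r, a, b, 4);
      /* now r = { 0, 0, 1, 0 } (i.e. 2^64) and cy = 0 */
    }
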
diff --git a/rts/gmp/mpn/powerpc32/addmul_1.asm b/rts/gmp/mpn/powerpc32/addmul_1.asm
new file mode 100644
index 0000000000..3ef75b1532
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/addmul_1.asm
@@ -0,0 +1,124 @@
+dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603
+dnl or PPC750 since I don't have access to any such machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpi cr0,r5,9 C more than 9 limbs?
+ bgt cr0,.Lbig C branch if more than 9 limbs
+
+ mtctr r5
+ lwz r0,0(r4)
+ mullw r7,r0,r6
+ mulhwu r10,r0,r6
+ lwz r9,0(r3)
+ addc r8,r7,r9
+ addi r3,r3,-4
+ bdz .Lend
+.Lloop:
+ lwzu r0,4(r4)
+ stwu r8,4(r3)
+ mullw r8,r0,r6
+ adde r7,r8,r10
+ mulhwu r10,r0,r6
+ lwz r9,4(r3)
+ addze r10,r10
+ addc r8,r7,r9
+ bdnz .Lloop
+.Lend: stw r8,4(r3)
+ addze r3,r10
+ blr
+
+.Lbig: stmw r30,-32(r1)
+ addi r5,r5,-1
+ srwi r0,r5,2
+ mtctr r0
+
+ lwz r7,0(r4)
+ mullw r8,r7,r6
+ mulhwu r0,r7,r6
+ lwz r7,0(r3)
+ addc r8,r8,r7
+ stw r8,0(r3)
+
+.LloopU:
+ lwz r7,4(r4)
+ lwz r12,8(r4)
+ lwz r30,12(r4)
+ lwzu r31,16(r4)
+ mullw r8,r7,r6
+ mullw r9,r12,r6
+ mullw r10,r30,r6
+ mullw r11,r31,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ adde r9,r9,r0
+ mulhwu r0,r12,r6
+ lwz r12,8(r3)
+ adde r10,r10,r0
+ mulhwu r0,r30,r6
+ lwz r30,12(r3)
+ adde r11,r11,r0
+ mulhwu r0,r31,r6
+ lwz r31,16(r3)
+ addze r0,r0 C new cy_limb
+ addc r8,r8,r7
+ stw r8,4(r3)
+ adde r9,r9,r12
+ stw r9,8(r3)
+ adde r10,r10,r30
+ stw r10,12(r3)
+ adde r11,r11,r31
+ stwu r11,16(r3)
+ bdnz .LloopU
+
+ andi. r31,r5,3
+ mtctr r31
+ beq cr0,.Lendx
+
+.LloopE:
+ lwzu r7,4(r4)
+ mullw r8,r7,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ addze r0,r0 C new cy_limb
+ addc r8,r8,r7
+ stwu r8,4(r3)
+ bdnz .LloopE
+.Lendx:
+ addze r3,r0
+ lmw r30,-32(r1)
+ blr
+EPILOGUE(mpn_addmul_1)
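
Underneath the 4-way unrolling, both the short path and the .Lbig path
compute the same recurrence: each step forms a 64-bit partial product, folds
in the result limb and the running carry, and splits the sum back into a
stored limb and a new carry. A compact C sketch (illustrative names, 32-bit
limbs):

    #include <stdint.h>

    /* {rp, n} += {up, n} * v; return the carry limb off the top. */
    uint32_t addmul_1_sketch (uint32_t *rp, const uint32_t *up,
                              long n, uint32_t v)
    {
      uint32_t cy = 0;                  /* cy_limb in the comments above */
      for (long i = 0; i < n; i++)
        {
          uint64_t t = (uint64_t) up[i] * v + rp[i] + cy;  /* cannot overflow */
          rp[i] = (uint32_t) t;         /* low half back to the result */
          cy = (uint32_t) (t >> 32);    /* high half becomes the next carry */
        }
      return cy;
    }
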
diff --git a/rts/gmp/mpn/powerpc32/aix.m4 b/rts/gmp/mpn/powerpc32/aix.m4
new file mode 100644
index 0000000000..2bd8425817
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/aix.m4
@@ -0,0 +1,39 @@
+divert(-1)
+dnl m4 macros for AIX 32-bit assembly.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+define(`ASM_START',
+ `.toc')
+
+define(`PROLOGUE',
+ `
+ .globl $1
+ .globl .$1
+ .csect $1[DS],2
+$1:
+ .long .$1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.$1:')
+
+define(`EPILOGUE', `')
+
+divert
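
Expanding the macro by hand shows what each PROLOGUE(name) call in the .asm
files turns into under AIX conventions: a function-descriptor csect holding
the entry address and TOC anchor, plus the code csect with the actual entry
point. Mechanically, from the definition above (modulo any symbol renaming
done elsewhere in the build), PROLOGUE(mpn_add_n) yields:

            .globl  mpn_add_n
            .globl  .mpn_add_n
            .csect  mpn_add_n[DS],2
    mpn_add_n:
            .long   .mpn_add_n, TOC[tc0], 0
            .csect  .text[PR]
            .align  2
    .mpn_add_n:
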
diff --git a/rts/gmp/mpn/powerpc32/gmp-mparam.h b/rts/gmp/mpn/powerpc32/gmp-mparam.h
new file mode 100644
index 0000000000..b283185789
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/gmp-mparam.h
@@ -0,0 +1,66 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 604.  Presumably they should be considerably
+   different for the 603 and 750, which have much slower multiply
+   instructions.  */
+
+/* Generated by tuneup.c, 2000-05-26. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 26 /* tuneup says 20 */
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 228
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 46 /* tuneup says 44 */
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 262
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 52
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 86
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 23
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 7
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 53
+#endif
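
These cutoffs are consumed by the size dispatch in the generic multiply code:
below a threshold the O(n^2) basecase wins; above it, Karatsuba and then
Toom-3 take over. A hedged sketch of the pattern, using the thresholds
defined above (the helper names here are hypothetical stand-ins, not the
library's exact entry points):

    #include <stdint.h>

    extern void mul_basecase (uint32_t *rp, const uint32_t *up,
                              const uint32_t *vp, long n);   /* hypothetical */
    extern void mul_karatsuba (uint32_t *rp, const uint32_t *up,
                               const uint32_t *vp, long n);  /* hypothetical */
    extern void mul_toom3 (uint32_t *rp, const uint32_t *up,
                           const uint32_t *vp, long n);      /* hypothetical */

    void mul_n_sketch (uint32_t *rp, const uint32_t *up,
                       const uint32_t *vp, long n)
    {
      if (n < KARATSUBA_MUL_THRESHOLD)      /* 26 on the 604, per above */
        mul_basecase (rp, up, vp, n);       /* schoolbook multiply */
      else if (n < TOOM3_MUL_THRESHOLD)     /* 228 on the 604 */
        mul_karatsuba (rp, up, vp, n);
      else
        mul_toom3 (rp, up, vp, n);
    }
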
diff --git a/rts/gmp/mpn/powerpc32/lshift.asm b/rts/gmp/mpn/powerpc32/lshift.asm
new file mode 100644
index 0000000000..73a85430ab
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/lshift.asm
@@ -0,0 +1,145 @@
+dnl PowerPC-32 mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1995, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ cmpi cr0,r5,12 C more than 12 limbs?
+ slwi r0,r5,2
+ add r4,r4,r0 C make r4 point at end of s1
+ add r7,r3,r0 C make r7 point at end of res
+ bgt .LBIG C branch if more than 12 limbs
+
+ mtctr r5 C copy size into CTR
+ subfic r8,r6,32
+ lwzu r11,-4(r4) C load first s1 limb
+ srw r3,r11,r8 C compute function return value
+ bdz .Lend1
+
+.Loop: lwzu r10,-4(r4)
+ slw r9,r11,r6
+ srw r12,r10,r8
+ or r9,r9,r12
+ stwu r9,-4(r7)
+ bdz .Lend2
+ lwzu r11,-4(r4)
+ slw r9,r10,r6
+ srw r12,r11,r8
+ or r9,r9,r12
+ stwu r9,-4(r7)
+ bdnz .Loop
+
+.Lend1: slw r0,r11,r6
+ stw r0,-4(r7)
+ blr
+.Lend2: slw r0,r10,r6
+ stw r0,-4(r7)
+ blr
+
+.LBIG:
+ stmw r24,-32(r1) C save registers we are supposed to preserve
+ lwzu r9,-4(r4)
+ subfic r8,r6,32
+ srw r3,r9,r8 C compute function return value
+ slw r0,r9,r6
+ addi r5,r5,-1
+
+ andi. r10,r5,3 C count for spill loop
+ beq .Le
+ mtctr r10
+ lwzu r28,-4(r4)
+ bdz .Lxe0
+
+.Loop0: slw r12,r28,r6
+ srw r24,r28,r8
+ lwzu r28,-4(r4)
+ or r24,r0,r24
+ stwu r24,-4(r7)
+ mr r0,r12
+ bdnz .Loop0 C taken at most once!
+
+.Lxe0: slw r12,r28,r6
+ srw r24,r28,r8
+ or r24,r0,r24
+ stwu r24,-4(r7)
+ mr r0,r12
+
+.Le: srwi r5,r5,2 C count for unrolled loop
+ addi r5,r5,-1
+ mtctr r5
+ lwz r28,-4(r4)
+ lwz r29,-8(r4)
+ lwz r30,-12(r4)
+ lwzu r31,-16(r4)
+
+.LoopU: slw r9,r28,r6
+ srw r24,r28,r8
+ lwz r28,-4(r4)
+ slw r10,r29,r6
+ srw r25,r29,r8
+ lwz r29,-8(r4)
+ slw r11,r30,r6
+ srw r26,r30,r8
+ lwz r30,-12(r4)
+ slw r12,r31,r6
+ srw r27,r31,r8
+ lwzu r31,-16(r4)
+ or r24,r0,r24
+ stw r24,-4(r7)
+ or r25,r9,r25
+ stw r25,-8(r7)
+ or r26,r10,r26
+ stw r26,-12(r7)
+ or r27,r11,r27
+ stwu r27,-16(r7)
+ mr r0,r12
+ bdnz .LoopU
+
+ slw r9,r28,r6
+ srw r24,r28,r8
+ slw r10,r29,r6
+ srw r25,r29,r8
+ slw r11,r30,r6
+ srw r26,r30,r8
+ slw r12,r31,r6
+ srw r27,r31,r8
+ or r24,r0,r24
+ stw r24,-4(r7)
+ or r25,r9,r25
+ stw r25,-8(r7)
+ or r26,r10,r26
+ stw r26,-12(r7)
+ or r27,r11,r27
+ stwu r27,-16(r7)
+ mr r0,r12
+
+ stw r0,-4(r7)
+ lmw r24,-32(r1) C restore registers
+ blr
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/powerpc32/mul_1.asm b/rts/gmp/mpn/powerpc32/mul_1.asm
new file mode 100644
index 0000000000..ec878b54d5
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/mul_1.asm
@@ -0,0 +1,86 @@
+dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604 but it runs decently even on PPC601. It
+dnl has not been tested on a PPC603 since I don't have access to any such
+dnl machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ mtctr r5
+ addi r3,r3,-4 C adjust res_ptr, it's offset before it's used
+ li r12,0 C clear upper product reg
+ addic r0,r0,0 C clear cy
+C Start software pipeline
+ lwz r8,0(r4)
+ bdz .Lend3
+ stmw r30,-8(r1) C save registers we are supposed to preserve
+ lwzu r9,4(r4)
+ mullw r11,r8,r6
+ mulhwu r0,r8,r6
+ bdz .Lend1
+C Software pipelined main loop
+.Loop: lwz r8,4(r4)
+ mullw r10,r9,r6
+ adde r30,r11,r12
+ mulhwu r12,r9,r6
+ stw r30,4(r3)
+ bdz .Lend2
+ lwzu r9,8(r4)
+ mullw r11,r8,r6
+ adde r31,r10,r0
+ mulhwu r0,r8,r6
+ stwu r31,8(r3)
+ bdnz .Loop
+C Finish software pipeline
+.Lend1: mullw r10,r9,r6
+ adde r30,r11,r12
+ mulhwu r12,r9,r6
+ stw r30,4(r3)
+ adde r31,r10,r0
+ stwu r31,8(r3)
+ addze r3,r12
+ lmw r30,-8(r1) C restore registers from stack
+ blr
+.Lend2: mullw r11,r8,r6
+ adde r31,r10,r0
+ mulhwu r0,r8,r6
+ stwu r31,8(r3)
+ adde r30,r11,r12
+ stw r30,4(r3)
+ addze r3,r0
+ lmw r30,-8(r1) C restore registers from stack
+ blr
+.Lend3: mullw r11,r8,r6
+ stw r11,4(r3)
+ mulhwu r3,r8,r6
+ blr
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/powerpc32/regmap.m4 b/rts/gmp/mpn/powerpc32/regmap.m4
new file mode 100644
index 0000000000..978f18902a
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/regmap.m4
@@ -0,0 +1,34 @@
+divert(-1)
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Map register names r0, r1, etc., to just `0', `1', etc.
+dnl This is needed on all systems but NeXT, Rhapsody, and MacOS-X.
+forloop(i,0,31,
+`define(`r'i,i)'
+)
+
+dnl Likewise for cr0, cr1, etc.
+forloop(i,0,7,
+`define(`cr'i,i)'
+)
+
+divert
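
The effect is purely textual: once these defines run, register mnemonics in
the sources are replaced by bare numbers before the assembler sees them. For
example (output on the right is what non-NeXT assemblers expect):

    add   r9,r9,r12      =>   add   9,9,12
    cmpi  cr0,r5,9       =>   cmpi  0,5,9
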
diff --git a/rts/gmp/mpn/powerpc32/rshift.asm b/rts/gmp/mpn/powerpc32/rshift.asm
new file mode 100644
index 0000000000..a09ba04938
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/rshift.asm
@@ -0,0 +1,60 @@
+dnl PowerPC-32 mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ mtctr r5 C copy size into CTR
+ addi r7,r3,-4 C move adjusted res_ptr to free return reg
+ subfic r8,r6,32
+ lwz r11,0(r4) C load first s1 limb
+ slw r3,r11,r8 C compute function return value
+ bdz .Lend1
+
+.Loop: lwzu r10,4(r4)
+ srw r9,r11,r6
+ slw r12,r10,r8
+ or r9,r9,r12
+ stwu r9,4(r7)
+ bdz .Lend2
+ lwzu r11,4(r4)
+ srw r9,r10,r6
+ slw r12,r11,r8
+ or r9,r9,r12
+ stwu r9,4(r7)
+ bdnz .Loop
+
+.Lend1: srw r0,r11,r6
+ stw r0,4(r7)
+ blr
+
+.Lend2: srw r0,r10,r6
+ stw r0,4(r7)
+ blr
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/powerpc32/sub_n.asm b/rts/gmp/mpn/powerpc32/sub_n.asm
new file mode 100644
index 0000000000..b04b4192ef
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/sub_n.asm
@@ -0,0 +1,61 @@
+dnl PowerPC-32 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl s2_ptr r5
+dnl size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ mtctr r6 C copy size into CTR
+ addic r0,r6,-1 C set cy
+ lwz r8,0(r4) C load least significant s1 limb
+ lwz r0,0(r5) C load least significant s2 limb
+ addi r3,r3,-4 C offset res_ptr, it's updated before it's used
+ bdz .Lend C If done, skip loop
+.Loop: lwz r9,4(r4) C load s1 limb
+ lwz r10,4(r5) C load s2 limb
+ subfe r7,r0,r8 C subtract limbs with cy, set cy
+ stw r7,4(r3) C store result limb
+ bdz .Lexit C decrement CTR and exit if done
+ lwzu r8,8(r4) C load s1 limb and update s1_ptr
+ lwzu r0,8(r5) C load s2 limb and update s2_ptr
+ subfe r7,r10,r9 C subtract limbs with cy, set cy
+ stwu r7,8(r3) C store result limb and update res_ptr
+ bdnz .Loop C decrement CTR and loop back
+
+.Lend: subfe r7,r0,r8
+ stw r7,4(r3) C store ultimate result limb
+ subfe r3,r0,r0 C load !cy into ...
+ subfic r3,r3,0 C ... return value register
+ blr
+.Lexit: subfe r7,r10,r9
+ stw r7,8(r3)
+ subfe r3,r0,r0 C load !cy into ...
+ subfic r3,r3,0 C ... return value register
+ blr
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/powerpc32/submul_1.asm b/rts/gmp/mpn/powerpc32/submul_1.asm
new file mode 100644
index 0000000000..a129e9f9ea
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/submul_1.asm
@@ -0,0 +1,130 @@
+dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603
+dnl or PPC750 since I don't have access to any such machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ cmpi cr0,r5,9 C more than 9 limbs?
+ bgt cr0,.Lbig C branch if more than 9 limbs
+
+ mtctr r5
+ lwz r0,0(r4)
+ mullw r7,r0,r6
+ mulhwu r10,r0,r6
+ lwz r9,0(r3)
+ subfc r8,r7,r9
+ addc r7,r7,r8 C invert cy (r7 is junk)
+ addi r3,r3,-4
+ bdz .Lend
+.Lloop:
+ lwzu r0,4(r4)
+ stwu r8,4(r3)
+ mullw r8,r0,r6
+ adde r7,r8,r10
+ mulhwu r10,r0,r6
+ lwz r9,4(r3)
+ addze r10,r10
+ subfc r8,r7,r9
+ addc r7,r7,r8 C invert cy (r7 is junk)
+ bdnz .Lloop
+.Lend: stw r8,4(r3)
+ addze r3,r10
+ blr
+
+.Lbig: stmw r30,-32(r1)
+ addi r5,r5,-1
+ srwi r0,r5,2
+ mtctr r0
+
+ lwz r7,0(r4)
+ mullw r8,r7,r6
+ mulhwu r0,r7,r6
+ lwz r7,0(r3)
+ subfc r7,r8,r7
+ addc r8,r8,r7
+ stw r7,0(r3)
+
+.LloopU:
+ lwz r7,4(r4)
+ lwz r12,8(r4)
+ lwz r30,12(r4)
+ lwzu r31,16(r4)
+ mullw r8,r7,r6
+ mullw r9,r12,r6
+ mullw r10,r30,r6
+ mullw r11,r31,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ adde r9,r9,r0
+ mulhwu r0,r12,r6
+ lwz r12,8(r3)
+ adde r10,r10,r0
+ mulhwu r0,r30,r6
+ lwz r30,12(r3)
+ adde r11,r11,r0
+ mulhwu r0,r31,r6
+ lwz r31,16(r3)
+ addze r0,r0 C new cy_limb
+ subfc r7,r8,r7
+ stw r7,4(r3)
+ subfe r12,r9,r12
+ stw r12,8(r3)
+ subfe r30,r10,r30
+ stw r30,12(r3)
+ subfe r31,r11,r31
+ stwu r31,16(r3)
+ subfe r11,r11,r11 C invert ...
+ addic r11,r11,1 C ... carry
+ bdnz .LloopU
+
+ andi. r31,r5,3
+ mtctr r31
+ beq cr0,.Lendx
+
+.LloopE:
+ lwzu r7,4(r4)
+ mullw r8,r7,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ addze r0,r0 C new cy_limb
+ subfc r7,r8,r7
+ addc r8,r8,r7
+ stwu r7,4(r3)
+ bdnz .LloopE
+.Lendx:
+ addze r3,r0
+ lmw r30,-32(r1)
+ blr
+EPILOGUE(mpn_submul_1)
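
A note on the `invert cy' idiom used in both the short loop and the .Lbig
tail above: PowerPC's subfc sets CA to the complement of the borrow, while
the adde-based carry chain wants the borrow itself, so the code re-adds the
difference purely to flip CA. The identity is easy to verify in C
(illustrative demonstration, 32-bit):

    #include <stdint.h>

    /* subfc r8,r7,r9: r8 = r9 - r7, CA = (r9 >= r7)  [no-borrow flag]
       addc  r7,r7,r8: recomputes r9, CA = (r9 < r7)  [the true borrow] */
    void invert_ca_demo (uint32_t r7, uint32_t r9)
    {
      uint32_t r8 = r9 - r7;
      uint32_t ca_subfc = (r9 >= r7);                              /* after subfc */
      uint32_t ca_addc  = (uint32_t) (((uint64_t) r7 + r8) >> 32); /* after addc */
      /* ca_addc == 1 - ca_subfc for every r7, r9 */
    }
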
diff --git a/rts/gmp/mpn/powerpc32/umul.asm b/rts/gmp/mpn/powerpc32/umul.asm
new file mode 100644
index 0000000000..eeaa0a4dc8
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/umul.asm
@@ -0,0 +1,32 @@
+dnl PowerPC-32 umul_ppmm -- support for longlong.h
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ mullw 0,4,5
+ mulhwu 9,4,5
+ stw 0,0(3)
+ mr 3,9
+ blr
+EPILOGUE(mpn_umul_ppmm)
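
The calling convention is visible in the five instructions: r4 and r5 are
multiplied, the low product word is stored through r3, and the high word is
returned. A presumed C-level shape and usage example (the prototype is an
assumption read off the register use, not a documented signature):

    #include <stdint.h>

    extern uint32_t mpn_umul_ppmm (uint32_t *lowptr, uint32_t a, uint32_t b);

    void example (void)
    {
      uint32_t lo;
      uint32_t hi = mpn_umul_ppmm (&lo, 0xffffffffu, 0xffffffffu);
      /* (2^32-1)^2 = 0xfffffffe00000001, so hi = 0xfffffffe, lo = 1 */
    }
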
diff --git a/rts/gmp/mpn/powerpc64/README b/rts/gmp/mpn/powerpc64/README
new file mode 100644
index 0000000000..c779276917
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/README
@@ -0,0 +1,36 @@
+PPC630 (aka Power3) pipeline information:
+
+Decoding is 4-way and issue is 8-way with some out-of-order capability.
+LS1 - ld/st unit 1
+LS2 - ld/st unit 2
+FXU1 - integer unit 1, handles any simple integer instructions
+FXU2 - integer unit 2, handles any simple integer instructions
+FXU3 - integer unit 3, handles integer multiply and divide
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+Memory: Any two memory operations can issue, but the memory subsystem
+ can sustain just one store per cycle.
+Simple integer: 2 operations (such as add, rl*)
+Integer multiply: 1 operation every 9th cycle worst case; exact timing depends
+ on the most significant bit position of the 2nd operand
+ (10 bits per cycle). The multiply unit is not pipelined;
+ only one multiply operation may be in progress at a time.
+Integer divide: ?
+Floating-point: Any 2 plain arithmetic instructions (such as fmul, fadd, fmadd)
+ Latency = 4.
+Floating-point divide:
+ ?
+Floating-point square root:
+ ?
+
+Best possible times for the main loops:
+shift: 1.5 cycles, limited by integer unit contention.
+ With 63 special loops, one for each shift count, we could
+ reduce the needed integer instructions to 2, which would
+ reduce the best possible time to 1 cycle.
+add/sub: 1.5 cycles, limited by ld/st unit contention.
+mul: 18 cycles (average) unless floating-point operations are used,
+ but those would only help for multiplies of perhaps 10 or more
+ limbs.
+addmul/submul: Same situation as for mul.
diff --git a/rts/gmp/mpn/powerpc64/add_n.asm b/rts/gmp/mpn/powerpc64/add_n.asm
new file mode 100644
index 0000000000..c3325376dc
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/add_n.asm
@@ -0,0 +1,61 @@
+# PowerPC-64 mpn_add_n -- Add two limb vectors of the same length > 0 and
+# store sum in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ mtctr r6 # copy size into CTR
+ addic r0,r0,0 # clear cy
+ ld r8,0(r4) # load least significant s1 limb
+ ld r0,0(r5) # load least significant s2 limb
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+ bdz .Lend # If done, skip loop
+.Loop: ld r9,8(r4) # load s1 limb
+ ld r10,8(r5) # load s2 limb
+ adde r7,r0,r8 # add limbs with cy, set cy
+ std r7,8(r3) # store result limb
+ bdz .Lexit # decrement CTR and exit if done
+ ldu r8,16(r4) # load s1 limb and update s1_ptr
+ ldu r0,16(r5) # load s2 limb and update s2_ptr
+ adde r7,r10,r9 # add limbs with cy, set cy
+ stdu r7,16(r3) # store result limb and update res_ptr
+ bdnz .Loop # decrement CTR and loop back
+
+.Lend: adde r7,r0,r8
+ std r7,8(r3) # store ultimate result limb
+ li r3,0 # load cy into ...
+ addze r3,r3 # ... return value register
+ blr
+.Lexit: adde r7,r10,r9
+ std r7,16(r3)
+ li r3,0 # load cy into ...
+ addze r3,r3 # ... return value register
+ blr
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/powerpc64/addmul_1.asm b/rts/gmp/mpn/powerpc64/addmul_1.asm
new file mode 100644
index 0000000000..81774482fe
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/addmul_1.asm
@@ -0,0 +1,52 @@
+# PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ ld 10,8(3)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ addze 9,9
+ addc 7,7,10
+ stdu 7,8(3)
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/powerpc64/addsub_n.asm b/rts/gmp/mpn/powerpc64/addsub_n.asm
new file mode 100644
index 0000000000..4ed40d71ae
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/addsub_n.asm
@@ -0,0 +1,107 @@
+# PowerPC-64 mpn_addsub_n -- Simultaneous add and sub.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# r1p r3
+# r2p r4
+# s1p r5
+# s2p r6
+# n r7
+
+include(`asm-syntax.m4')
+
+define(SAVE_BORROW_RESTORE_CARRY,
+ `sldi $1,$1,63
+ adde $1,$1,$1')
+define(SAVE_CARRY_RESTORE_BORROW,
+ `sldi $1,$1,63
+ adde $1,$1,$1')
+
+# 19991117
+
+# This is just crafted for testing some ideas and verifying that we can make
+# it run fast.  It runs at 2.55 cycles/limb on the 630, which is very good.
+# We should play a little with the schedule.  No time has been spent on that.
+
+# To finish this, the loop warm-up and cool-down code needs to be written,
+# and the result needs to be tested.  Also, the proper calling sequence should
+# be used.
+
+# r1p r2p s1p s2p n
+# Use reg r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12
+
+ASM_START()
+PROLOGUE(mpn_addsub_n)
+ std r14,-64(1)
+ std r15,-56(1)
+ std r16,-48(1)
+ std r17,-40(1)
+ std r18,-32(1)
+ std r19,-24(1)
+
+ srdi r7,r7,2
+ mtctr r7 # copy size into CTR
+ addic r0,r0,0 # clear cy
+ addi r3,r3,-8 # offset r1p, it's updated before it's used
+ addi r4,r4,-8 # offset r2p, it's updated before it's used
+
+.Loop:
+ adde r12,r8,r9
+ std r12,8(r3)
+ adde r12,r10,r11
+ std r12,16(r3)
+
+ SAVE_CARRY_RESTORE_BORROW(r0)
+
+ subfe r12,r8,r9
+ std r12,8(r4)
+ ld r8,8(r5) # s1 L 1
+ ld r9,8(r6) # s2 L 1
+ subfe r12,r10,r11
+ std r12,16(r4)
+ ld r10,16(r5) # s1 L 2
+ ld r11,16(r6) # s2 L 2
+# pair -------------------------
+ subfe r12,r14,r15
+ std r12,24(r4)
+ subfe r12,r16,r17
+ stdu r12,32(r4)
+
+ SAVE_BORROW_RESTORE_CARRY(r0)
+
+ adde r12,r14,r15
+ std r12,24(r3)
+ ld r14,24(r5) # s1 L 3
+ ld r15,24(r6) # s2 L 3
+ adde r12,r16,r17
+ stdu r12,32(r3)
+ ldu r16,32(r5) # s1 L 4
+ ldu r17,32(r6) # s2 L 4
+ bdnz .Loop
+
+ ld r14,-64(1)
+ ld r15,-56(1)
+ ld r16,-48(1)
+ ld r17,-40(1)
+ ld r18,-32(1)
+ ld r19,-24(1)
+ blr
+EPILOGUE(mpn_addsub_n)
diff --git a/rts/gmp/mpn/powerpc64/aix.m4 b/rts/gmp/mpn/powerpc64/aix.m4
new file mode 100644
index 0000000000..aee9f1f97a
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/aix.m4
@@ -0,0 +1,40 @@
+divert(-1)
+dnl m4 macros for AIX 64-bit assembly.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+define(`ASM_START',
+ `.machine "ppc64"
+ .toc')
+
+define(`PROLOGUE',
+ `
+ .globl $1
+ .globl .$1
+ .csect $1[DS],3
+$1:
+ .llong .$1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.$1:')
+
+define(`EPILOGUE', `')
+
+divert
diff --git a/rts/gmp/mpn/powerpc64/copyd.asm b/rts/gmp/mpn/powerpc64/copyd.asm
new file mode 100644
index 0000000000..d06e8c25fd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/copyd.asm
@@ -0,0 +1,45 @@
+# PowerPC-64 mpn_copyd -- Copy a limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr r3
+# sptr r4
+# n r5
+
+include(`../config.m4')
+
+# Unrolling this analogously to sparc64/copyi.s doesn't help for any
+# operand size.
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ cmpdi cr0,r5,0
+ mtctr r5
+ sldi r5,r5,3
+ add r4,r4,r5
+ add r3,r3,r5
+ beq cr0,.Lend
+.Loop: ldu r0,-8(r4)
+ stdu r0,-8(r3)
+ bdnz .Loop
+.Lend: blr
+EPILOGUE(mpn_copyd)
diff --git a/rts/gmp/mpn/powerpc64/copyi.asm b/rts/gmp/mpn/powerpc64/copyi.asm
new file mode 100644
index 0000000000..a1bedc4c5b
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/copyi.asm
@@ -0,0 +1,44 @@
+# PowerPC-64 mpn_copyi -- Copy a limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr r3
+# sptr r4
+# n r5
+
+include(`../config.m4')
+
+# Unrolling this analogously to sparc64/copyi.s doesn't help for any
+# operand size.
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ cmpdi cr0,r5,0
+ mtctr r5
+ addi r4,r4,-8
+ addi r3,r3,-8
+ beq cr0,.Lend
+.Loop: ldu r0,8(r4)
+ stdu r0,8(r3)
+ bdnz .Loop
+.Lend: blr
+EPILOGUE(mpn_copyi)
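
The only difference between copyd and copyi is the direction of travel, and
that is exactly what decides which overlaps each can tolerate, memmove-style:
copyi (ascending) is safe when the destination starts at or below the source;
copyd (descending) when it starts at or above. In C terms (illustrative,
64-bit limbs as on powerpc64):

    #include <stdint.h>

    void copyi_sketch (uint64_t *rp, const uint64_t *sp, long n)
    {
      for (long i = 0; i < n; i++)       /* ascending, like ldu/stdu +8 */
        rp[i] = sp[i];
    }

    void copyd_sketch (uint64_t *rp, const uint64_t *sp, long n)
    {
      for (long i = n - 1; i >= 0; i--)  /* descending, like ldu/stdu -8 */
        rp[i] = sp[i];
    }
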
diff --git a/rts/gmp/mpn/powerpc64/gmp-mparam.h b/rts/gmp/mpn/powerpc64/gmp-mparam.h
new file mode 100644
index 0000000000..6fefb960cd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-07-16. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 10
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 57
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 16
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 89
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 28
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 216
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 14
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 6
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 163
+#endif
diff --git a/rts/gmp/mpn/powerpc64/lshift.asm b/rts/gmp/mpn/powerpc64/lshift.asm
new file mode 100644
index 0000000000..cef3a81fdd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/lshift.asm
@@ -0,0 +1,159 @@
+# PowerPC-64 mpn_lshift -- Shift a number left.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ cmpdi cr0,r5,20 # more than 20 limbs?
+ sldi r0,r5,3
+ add r4,r4,r0 # make r4 point at end of s1
+ add r7,r3,r0 # make r7 point at end of res
+ bgt .LBIG # branch if more than 20 limbs
+
+ mtctr r5 # copy size into CTR
+ subfic r8,r6,64
+ ldu r11,-8(r4) # load first s1 limb
+ srd r3,r11,r8 # compute function return value
+ bdz .Lend1
+
+.Loop: ldu r10,-8(r4)
+ sld r9,r11,r6
+ srd r12,r10,r8
+ or r9,r9,r12
+ stdu r9,-8(r7)
+ bdz .Lend2
+ ldu r11,-8(r4)
+ sld r9,r10,r6
+ srd r12,r11,r8
+ or r9,r9,r12
+ stdu r9,-8(r7)
+ bdnz .Loop
+
+.Lend1: sld r0,r11,r6
+ std r0,-8(r7)
+ blr
+.Lend2: sld r0,r10,r6
+ std r0,-8(r7)
+ blr
+
+.LBIG:
+ std r24,-64(1)
+ std r25,-56(1)
+ std r26,-48(1)
+ std r27,-40(1)
+ std r28,-32(1)
+ std r29,-24(1)
+ std r30,-16(1)
+ std r31,-8(1)
+ ldu r9,-8(r4)
+ subfic r8,r6,64
+ srd r3,r9,r8 # compute function return value
+ sld r0,r9,r6
+ addi r5,r5,-1
+
+ andi. r10,r5,3 # count for spill loop
+ beq .Le
+ mtctr r10
+ ldu r28,-8(r4)
+ bdz .Lxe0
+
+.Loop0: sld r12,r28,r6
+ srd r24,r28,r8
+ ldu r28,-8(r4)
+ or r24,r0,r24
+ stdu r24,-8(r7)
+ mr r0,r12
+ bdnz .Loop0 # taken at most once!
+
+.Lxe0: sld r12,r28,r6
+ srd r24,r28,r8
+ or r24,r0,r24
+ stdu r24,-8(r7)
+ mr r0,r12
+
+.Le: srdi r5,r5,2 # count for unrolled loop
+ addi r5,r5,-1
+ mtctr r5
+ ld r28,-8(r4)
+ ld r29,-16(r4)
+ ld r30,-24(r4)
+ ldu r31,-32(r4)
+
+.LoopU: sld r9,r28,r6
+ srd r24,r28,r8
+ ld r28,-8(r4)
+ sld r10,r29,r6
+ srd r25,r29,r8
+ ld r29,-16(r4)
+ sld r11,r30,r6
+ srd r26,r30,r8
+ ld r30,-24(r4)
+ sld r12,r31,r6
+ srd r27,r31,r8
+ ldu r31,-32(r4)
+ or r24,r0,r24
+ std r24,-8(r7)
+ or r25,r9,r25
+ std r25,-16(r7)
+ or r26,r10,r26
+ std r26,-24(r7)
+ or r27,r11,r27
+ stdu r27,-32(r7)
+ mr r0,r12
+ bdnz .LoopU
+
+ sld r9,r28,r6
+ srd r24,r28,r8
+ sld r10,r29,r6
+ srd r25,r29,r8
+ sld r11,r30,r6
+ srd r26,r30,r8
+ sld r12,r31,r6
+ srd r27,r31,r8
+ or r24,r0,r24
+ std r24,-8(r7)
+ or r25,r9,r25
+ std r25,-16(r7)
+ or r26,r10,r26
+ std r26,-24(r7)
+ or r27,r11,r27
+ stdu r27,-32(r7)
+ mr r0,r12
+
+ std r0,-8(r7)
+ ld r24,-64(1)
+ ld r25,-56(1)
+ ld r26,-48(1)
+ ld r27,-40(1)
+ ld r28,-32(1)
+ ld r29,-24(1)
+ ld r30,-16(1)
+ ld r31,-8(1)
+ blr
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/powerpc64/mul_1.asm b/rts/gmp/mpn/powerpc64/mul_1.asm
new file mode 100644
index 0000000000..47597283ff
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/mul_1.asm
@@ -0,0 +1,49 @@
+# PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ stdu 7,8(3)
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/powerpc64/rshift.asm b/rts/gmp/mpn/powerpc64/rshift.asm
new file mode 100644
index 0000000000..88272c7fa9
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/rshift.asm
@@ -0,0 +1,60 @@
+# PowerPC-64 mpn_rshift -- Shift a number right.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ mtctr r5 # copy size into CTR
+ addi r7,r3,-8 # move adjusted res_ptr to free return reg
+ subfic r8,r6,64
+ ld r11,0(r4) # load first s1 limb
+ sld r3,r11,r8 # compute function return value
+ bdz .Lend1
+
+.Loop: ldu r10,8(r4)
+ srd r9,r11,r6
+ sld r12,r10,r8
+ or r9,r9,r12
+ stdu r9,8(r7)
+ bdz .Lend2
+ ldu r11,8(r4)
+ srd r9,r10,r6
+ sld r12,r11,r8
+ or r9,r9,r12
+ stdu r9,8(r7)
+ bdnz .Loop
+
+.Lend1: srd r0,r11,r6
+ std r0,8(r7)
+ blr
+
+.Lend2: srd r0,r10,r6
+ std r0,8(r7)
+ blr
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/powerpc64/sub_n.asm b/rts/gmp/mpn/powerpc64/sub_n.asm
new file mode 100644
index 0000000000..4de3de69c7
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/sub_n.asm
@@ -0,0 +1,61 @@
+# PowerPC-64 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+# and store difference in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ mtctr r6 # copy size into CTR
+ addic r0,r6,-1 # set cy
+ ld r8,0(r4) # load least significant s1 limb
+ ld r0,0(r5) # load least significant s2 limb
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+ bdz .Lend # If done, skip loop
+.Loop: ld r9,8(r4) # load s1 limb
+ ld r10,8(r5) # load s2 limb
+ subfe r7,r0,r8 # subtract limbs with cy, set cy
+ std r7,8(r3) # store result limb
+ bdz .Lexit # decrement CTR and exit if done
+ ldu r8,16(r4) # load s1 limb and update s1_ptr
+ ldu r0,16(r5) # load s2 limb and update s2_ptr
+ subfe r7,r10,r9 # subtract limbs with cy, set cy
+ stdu r7,16(r3) # store result limb and update res_ptr
+ bdnz .Loop # decrement CTR and loop back
+
+.Lend: subfe r7,r0,r8
+ std r7,8(r3) # store ultimate result limb
+ subfe r3,r0,r0 # load !cy into ...
+ subfic r3,r3,0 # ... return value register
+ blr
+.Lexit: subfe r7,r10,r9
+ std r7,16(r3)
+ subfe r3,r0,r0 # load !cy into ...
+ subfic r3,r3,0 # ... return value register
+ blr
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/powerpc64/submul_1.asm b/rts/gmp/mpn/powerpc64/submul_1.asm
new file mode 100644
index 0000000000..17f6369a38
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/submul_1.asm
@@ -0,0 +1,54 @@
+# PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
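+# In C terms, with umul_ppmm(hi,lo,a,b) denoting the double-limb
+# product (an illustrative sketch of the semantics, not GMP's generic
+# code):
+#
+#   cy = 0;
+#   for (i = 0; i < size; i++)
+#     { umul_ppmm (hi, lo, s1[i], s2_limb);
+#       lo += cy;  hi += (lo < cy);     /* fold in previous carry */
+#       x = res[i];  res[i] = x - lo;
+#       cy = hi + (x < lo); }           /* borrow joins the carry */
+#   return cy;
+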
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	mtctr	5		# copy size into CTR
+	li	9,0		# cy_limb = 0
+	addic	0,0,0		# clear CA
+	cal	3,-8(3)		# offset res_ptr, it's updated before it's used
+	cal	4,-8(4)		# offset s1_ptr, it's updated before it's used
+.Loop:
+	ldu	0,8(4)		# load s1 limb and update s1_ptr
+	ld	10,8(3)		# load res limb
+	mulld	7,0,6		# low limb of product
+	adde	7,7,9		# add cy_limb and borrow from last round
+	mulhdu	9,0,6		# new cy_limb = high limb of product
+	addze	9,9		# pick up carry from adde
+	subfc	7,7,10		# subtract from res limb, CA = not borrow
+	stdu	7,8(3)		# store result limb and update res_ptr
+	subfe	11,11,11	# invert ...
+	addic	11,11,1		# ... carry
+	bdnz	.Loop		# decrement CTR and loop back
+
+	addze	3,9		# return cy_limb plus final borrow
+	blr
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/pyr/add_n.s b/rts/gmp/mpn/pyr/add_n.s
new file mode 100644
index 0000000000..e1fc535846
--- /dev/null
+++ b/rts/gmp/mpn/pyr/add_n.s
@@ -0,0 +1,76 @@
+# Pyramid __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
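+# The first loop handles size mod 4 limbs one at a time; the main loop
+# then processes four limbs per iteration.  Between iterations the carry
+# is parked in the carry-save register tr0 (subwb tr0,tr0) and
+# regenerated with rsubw at the top of each pass.
+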
+.text
+ .align 2
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2
+ andw $3,tr2
+ beq Lend0
+ subw tr2,pr3
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ addwc (pr2),tr1
+ movw tr1,(pr0)
+
+ subwb tr0,tr0
+ addw $4,pr0
+ addw $4,pr1
+ addw $4,pr2
+ addw $-1,tr2
+ bne Loop0
+
+ mtstw pr3,pr3
+ beq Lend
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ addwc (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1
+ addwc 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1
+ addwc 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1
+ addwc 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0
+ addw $16,pr0
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3
+ bne Loop
+Lend:
+ mnegw tr0,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/addmul_1.s b/rts/gmp/mpn/pyr/addmul_1.s
new file mode 100644
index 0000000000..65c3f8f008
--- /dev/null
+++ b/rts/gmp/mpn/pyr/addmul_1.s
@@ -0,0 +1,45 @@
+# Pyramid __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mova (pr0)[pr2*4],pr0
+ mova (pr1)[pr2*4],pr1
+ mnegw pr2,pr2
+ movw $0,tr3
+
+Loop: movw (pr1)[pr2*4],tr1
+ uemul pr3,tr0
+ addw tr3,tr1
+ movw $0,tr3
+ addwc tr0,tr3
+	movw	(pr0)[pr2*4],tr0
+ addw tr0,tr1
+ addwc $0,tr3
+ movw tr1,(pr0)[pr2*4]
+ addw $1,pr2
+ bne Loop
+
+ movw tr3,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/mul_1.s b/rts/gmp/mpn/pyr/mul_1.s
new file mode 100644
index 0000000000..1272297c42
--- /dev/null
+++ b/rts/gmp/mpn/pyr/mul_1.s
@@ -0,0 +1,42 @@
+# Pyramid __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ mova (pr0)[pr2*4],pr0
+ mova (pr1)[pr2*4],pr1
+ mnegw pr2,pr2
+ movw $0,tr3
+
+Loop: movw (pr1)[pr2*4],tr1
+ uemul pr3,tr0
+ addw tr3,tr1
+ movw $0,tr3
+ addwc tr0,tr3
+ movw tr1,(pr0)[pr2*4]
+ addw $1,pr2
+ bne Loop
+
+ movw tr3,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/sub_n.s b/rts/gmp/mpn/pyr/sub_n.s
new file mode 100644
index 0000000000..1fd2eb0f17
--- /dev/null
+++ b/rts/gmp/mpn/pyr/sub_n.s
@@ -0,0 +1,76 @@
+# Pyramid __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2
+ andw $3,tr2
+ beq Lend0
+ subw tr2,pr3
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ subwb (pr2),tr1
+ movw tr1,(pr0)
+
+ subwb tr0,tr0
+ addw $4,pr0
+ addw $4,pr1
+ addw $4,pr2
+ addw $-1,tr2
+ bne Loop0
+
+ mtstw pr3,pr3
+ beq Lend
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ subwb (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1
+ subwb 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1
+ subwb 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1
+ subwb 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0
+ addw $16,pr0
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3
+ bne Loop
+Lend:
+ mnegw tr0,pr0
+ ret
diff --git a/rts/gmp/mpn/sh/add_n.s b/rts/gmp/mpn/sh/add_n.s
new file mode 100644
index 0000000000..df388b31a3
--- /dev/null
+++ b/rts/gmp/mpn/sh/add_n.s
@@ -0,0 +1,47 @@
+! SH __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
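+! The dt instruction at the bottom of the loop clobbers the T bit, so
+! the carry cannot stay in T across iterations: movt parks it in r3
+! after the addc, and shlr r3 shifts it back into T at the top of the
+! next pass.
+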
+ .text
+ .align 2
+ .global ___gmpn_add_n
+___gmpn_add_n:
+ mov #0,r3 ! clear cy save reg
+
+Loop: mov.l @r5+,r1
+ mov.l @r6+,r2
+ shlr r3 ! restore cy
+ addc r2,r1
+ movt r3 ! save cy
+ mov.l r1,@r4
+ dt r7
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r3,r0 ! return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/sh/sh2/addmul_1.s b/rts/gmp/mpn/sh/sh2/addmul_1.s
new file mode 100644
index 0000000000..f34a7f0503
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/addmul_1.s
@@ -0,0 +1,53 @@
+! SH2 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+! the result to a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3
+ addc r0,r2 ! cy_limb += T, T = 0
+ addc r3,r1
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sh2/mul_1.s b/rts/gmp/mpn/sh/sh2/mul_1.s
new file mode 100644
index 0000000000..2a117a3175
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/mul_1.s
@@ -0,0 +1,50 @@
+! SH2 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_mul_1
+___gmpn_mul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1
+ sts mach,r2
+ addc r0,r2 ! propagate carry to cy_limb (dt clobbers T)
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sh2/submul_1.s b/rts/gmp/mpn/sh/sh2/submul_1.s
new file mode 100644
index 0000000000..eb9a27dde3
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/submul_1.s
@@ -0,0 +1,53 @@
+! SH2 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+! the result from a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_submul_1
+___gmpn_submul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3
+ addc r0,r2 ! cy_limb += T, T = 0
+ subc r3,r1
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sub_n.s b/rts/gmp/mpn/sh/sub_n.s
new file mode 100644
index 0000000000..5f818c95a8
--- /dev/null
+++ b/rts/gmp/mpn/sh/sub_n.s
@@ -0,0 +1,47 @@
+! SH __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+! difference in a third limb vector.
+
+! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
+ .text
+ .align 2
+ .global ___gmpn_sub_n
+___gmpn_sub_n:
+ mov #0,r3 ! clear cy save reg
+
+Loop: mov.l @r5+,r1
+ mov.l @r6+,r2
+ shlr r3 ! restore cy
+ subc r2,r1
+ movt r3 ! save cy
+ mov.l r1,@r4
+ dt r7
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r3,r0 ! return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/sparc32/README b/rts/gmp/mpn/sparc32/README
new file mode 100644
index 0000000000..7c19df7bc4
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/README
@@ -0,0 +1,36 @@
+This directory contains mpn functions for various SPARC chips.  Code that
+runs only on version 8 SPARC implementations is in the v8 subdirectory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+ Load and Store timing
+
+On most early SPARC implementations, the ST instruction takes multiple
+cycles, while a STD takes just a single cycle more than an ST.  For the CPUs
+in SPARCstation I and II, the times are 3 and 4 cycles, respectively.
+Therefore, combining two ST instructions into a STD when possible is a
+significant optimization.
+
+Later SPARC implementations have single cycle ST.
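+
+To illustrate (the register names here are only an example), the two
+word stores
+
+	st	%o4,[res_ptr+0]
+	st	%o5,[res_ptr+4]
+
+can be combined into the single doubleword store
+
+	std	%o4,[res_ptr+0]
+
+provided %o4/%o5 form an even/odd register pair and res_ptr is 8-byte
+aligned.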
+
+For SuperSPARC, we can perform just one memory instruction per cycle, even
+though up to two integer instructions can execute in its pipeline.  For
+programs that perform so many memory operations that there are not enough
+non-memory operations to issue in parallel with all memory operations, using
+LDD and STD when possible helps.
+
+STATUS
+
+1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5
+ cycles/limb asymptotically. We could optimize speed for special counts
+ by using ADDXCC.
+
+2. On a SuperSPARC, mpn_add_n and mpn_sub_n runs at 2.5 cycles/limb, or 2
+ cycles/limb asymptotically.
+
+3. mpn_mul_1 runs at what is believed to be optimal speed.
+
+4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a
+   cycle by avoiding one of the add instructions.  See a29k/addmul_1.
+
+The speed of the code for other SPARC implementations is uncertain.
diff --git a/rts/gmp/mpn/sparc32/add_n.asm b/rts/gmp/mpn/sparc32/add_n.asm
new file mode 100644
index 0000000000..5f1d00c0e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/add_n.asm
@@ -0,0 +1,236 @@
+dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
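+C The code dispatches on pointer alignment: V1a runs when res_ptr and
+C s2_ptr share alignment within a doubleword (s2 is then fetched with
+C ldd), V1b swaps the source pointers and reduces to V1a, and V2 covers
+C the case where s1_ptr and s2_ptr agree with each other but not with
+C res_ptr (both sources are then fetched with ldd).
+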
+ASM_START()
+PROLOGUE(mpn_add_n)
+ xor s2_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(1) C branch if alignment differs
+ nop
+C ** V1a **
+L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1) C if no, branch
+ nop
+C Add least significant limb separately to align res_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ addcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s1_ptr+0],%g4
+ addcc n,-10,n
+ ld [s1_ptr+4],%g1
+ ldd [s2_ptr+0],%g2
+ blt L(fin1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+16],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+20],%g1
+ ldd [s2_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+24],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+28],%g1
+ ldd [s2_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+32],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+36],%g1
+ ldd [s2_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1):
+ addcc n,8-2,n
+ blt L(end1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1):
+ addxcc %g4,%g2,%o4
+ addxcc %g1,%g3,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+ ld [s1_ptr+8],%g4
+ ld [s2_ptr+8],%g2
+ addxcc %g4,%g2,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+L(1): xor s1_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(2)
+ nop
+C ** V1b **
+ mov s2_ptr,%g1
+ mov s1_ptr,s2_ptr
+ b L(0)
+ mov %g1,s1_ptr
+
+C ** V2 **
+C If we come here, the alignment of s1_ptr and res_ptr differs, and so
+C does the alignment of s2_ptr and res_ptr.  Since there are only two
+C alignments we care about, s1_ptr and s2_ptr must therefore have the
+C same alignment.
+
+L(2): cmp n,1
+ be L(jone)
+ nop
+ andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0
+ be L(v2) C if no, branch
+ nop
+C Add least significant limb separately to align s1_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ addcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+
+L(v2): addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ blt L(fin2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ ldd [s1_ptr+8],%g2
+ ldd [s2_ptr+8],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+8]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+12]
+ ldd [s1_ptr+16],%g2
+ ldd [s2_ptr+16],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+16]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+20]
+ ldd [s1_ptr+24],%g2
+ ldd [s2_ptr+24],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+24]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+28]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop2)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin2):
+ addcc n,8-2,n
+ blt L(end2)
+ subcc %g0,%o4,%g0 C restore cy
+L(loope2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope2)
+ subcc %g0,%o4,%g0 C restore cy
+L(end2):
+ andcc n,1,%g0
+ be L(ret2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+L(jone):
+ ld [s1_ptr],%g4
+ ld [s2_ptr],%g2
+ addxcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+
+L(ret2):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/sparc32/addmul_1.asm b/rts/gmp/mpn/sparc32/addmul_1.asm
new file mode 100644
index 0000000000..80c94e4251
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/addmul_1.asm
@@ -0,0 +1,146 @@
+dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
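+C SPARC v7 has no integer multiply instruction; each mulscc below
+C performs one multiply step, consuming one multiplier bit from the %y
+C register.  When s2_limb fits in 12 bits the short path needs only 12
+C steps plus a fix-up shift; the general path runs 32 steps.  mulscc
+C computes a signed product, so a compensation term (one operand masked
+C by the other operand's sign) is folded into the high word afterwards.
+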
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ addcc %o5,%g1,%g1
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne L(loop0)
+ ld [%o4+%o2],%o5
+
+ addcc %o5,%g1,%g1
+ addx %o0,%g0,%o0
+ retl
+ st %g1,[%o4+%o2]
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ addcc %o5,%g3,%g3
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0
+ addcc %o2,4,%o2
+ bne L(loop)
+ ld [%o4+%o2],%o5
+
+ addcc %o5,%g3,%g3
+ addx %o0,%g0,%o0
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/lshift.asm b/rts/gmp/mpn/sparc32/lshift.asm
new file mode 100644
index 0000000000..529733ac2d
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/lshift.asm
@@ -0,0 +1,97 @@
+dnl SPARC mpn_lshift -- Shift a number left.
+dnl
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr %o0
+C src_ptr %o1
+C size %o2
+C cnt %o3
+
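+C The limbs are processed from the most significant end downward, so
+C this should also work for overlapping operands with res_ptr >= src_ptr.
+C The return value (the bits shifted out of the most significant limb)
+C is parked on the stack at [%sp+80] until it is reloaded at the end.
+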
+ASM_START()
+PROLOGUE(mpn_lshift)
+ sll %o2,2,%g1
+ add %o1,%g1,%o1 C make %o1 point at end of src
+ ld [%o1-4],%g2 C load first limb
+ sub %g0,%o3,%o5 C negate shift count
+ add %o0,%g1,%o0 C make %o0 point at end of res
+ add %o2,-1,%o2
+ andcc %o2,4-1,%g4 C number of limbs in first loop
+ srl %g2,%o5,%g1 C compute function result
+ be L(0) C if multiple of 4 limbs, skip first loop
+ st %g1,[%sp+80]
+
+ sub %o2,%g4,%o2 C adjust count for main loop
+
+L(loop0):
+ ld [%o1-8],%g3
+ add %o0,-4,%o0
+ add %o1,-4,%o1
+ addcc %g4,-1,%g4
+ sll %g2,%o3,%o4
+ srl %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ bne L(loop0)
+ st %o4,[%o0+0]
+
+L(0): tst %o2
+ be L(end)
+ nop
+
+L(loop):
+ ld [%o1-8],%g3
+ add %o0,-16,%o0
+ addcc %o2,-4,%o2
+ sll %g2,%o3,%o4
+ srl %g3,%o5,%g1
+
+ ld [%o1-12],%g2
+ sll %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0+12]
+ srl %g2,%o5,%g1
+
+ ld [%o1-16],%g3
+ sll %g2,%o3,%o4
+ or %g4,%g1,%g4
+ st %g4,[%o0+8]
+ srl %g3,%o5,%g1
+
+ ld [%o1-20],%g2
+ sll %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0+4]
+ srl %g2,%o5,%g1
+
+ add %o1,-16,%o1
+ or %g4,%g1,%g4
+ bne L(loop)
+ st %g4,[%o0+0]
+
+L(end): sll %g2,%o3,%g2
+ st %g2,[%o0-4]
+ retl
+ ld [%sp+80],%o0
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/sparc32/mul_1.asm b/rts/gmp/mpn/sparc32/mul_1.asm
new file mode 100644
index 0000000000..e5fedeabaa
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/mul_1.asm
@@ -0,0 +1,137 @@
+dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne,a L(loop0)
+ ld [%o1+%o2],%o5
+
+ retl
+ st %g1,[%o4+%o2]
+
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne,a L(loop)
+ ld [%o1+%o2],%o5
+
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/rshift.asm b/rts/gmp/mpn/sparc32/rshift.asm
new file mode 100644
index 0000000000..9187dbaa6f
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/rshift.asm
@@ -0,0 +1,93 @@
+dnl SPARC mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr %o0
+C src_ptr %o1
+C size %o2
+C cnt %o3
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ld [%o1],%g2 C load first limb
+ sub %g0,%o3,%o5 C negate shift count
+ add %o2,-1,%o2
+ andcc %o2,4-1,%g4 C number of limbs in first loop
+ sll %g2,%o5,%g1 C compute function result
+ be L(0) C if multiple of 4 limbs, skip first loop
+ st %g1,[%sp+80]
+
+ sub %o2,%g4,%o2 C adjust count for main loop
+
+L(loop0):
+ ld [%o1+4],%g3
+ add %o0,4,%o0
+ add %o1,4,%o1
+ addcc %g4,-1,%g4
+ srl %g2,%o3,%o4
+ sll %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ bne L(loop0)
+ st %o4,[%o0-4]
+
+L(0): tst %o2
+ be L(end)
+ nop
+
+L(loop):
+ ld [%o1+4],%g3
+ add %o0,16,%o0
+ addcc %o2,-4,%o2
+ srl %g2,%o3,%o4
+ sll %g3,%o5,%g1
+
+ ld [%o1+8],%g2
+ srl %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0-16]
+ sll %g2,%o5,%g1
+
+ ld [%o1+12],%g3
+ srl %g2,%o3,%o4
+ or %g4,%g1,%g4
+ st %g4,[%o0-12]
+ sll %g3,%o5,%g1
+
+ ld [%o1+16],%g2
+ srl %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0-8]
+ sll %g2,%o5,%g1
+
+ add %o1,16,%o1
+ or %g4,%g1,%g4
+ bne L(loop)
+ st %g4,[%o0-4]
+
+L(end): srl %g2,%o3,%g2
+ st %g2,[%o0-0]
+ retl
+ ld [%sp+80],%o0
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/sparc32/sub_n.asm b/rts/gmp/mpn/sparc32/sub_n.asm
new file mode 100644
index 0000000000..071909a1b6
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/sub_n.asm
@@ -0,0 +1,326 @@
+dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ xor s2_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(1) C branch if alignment differs
+ nop
+C ** V1a **
+ andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1) C if no, branch
+ nop
+C Subtract least significant limb separately to align res_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ subcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s1_ptr+0],%g4
+ addcc n,-10,n
+ ld [s1_ptr+4],%g1
+ ldd [s2_ptr+0],%g2
+ blt L(fin1)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+16],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+20],%g1
+ ldd [s2_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+24],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+28],%g1
+ ldd [s2_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+32],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+36],%g1
+ ldd [s2_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1):
+ addcc n,8-2,n
+ blt L(end1)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1):
+ subxcc %g4,%g2,%o4
+ subxcc %g1,%g3,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract last limb
+ ld [s1_ptr+8],%g4
+ ld [s2_ptr+8],%g2
+ subxcc %g4,%g2,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+L(1): xor s1_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(2)
+ nop
+C ** V1b **
+ andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1b) C if no, branch
+ nop
+C Subtract least significant limb separately to align res_ptr and s1_ptr
+ ld [s2_ptr],%g4
+ add s2_ptr,4,s2_ptr
+ ld [s1_ptr],%g2
+ add s1_ptr,4,s1_ptr
+ add n,-1,n
+ subcc %g2,%g4,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1b): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s2_ptr+0],%g4
+ addcc n,-10,n
+ ld [s2_ptr+4],%g1
+ ldd [s1_ptr+0],%g2
+ blt L(fin1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract blocks of 8 limbs until less than 8 limbs remain
+L(loop1b):
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+8],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+12],%g1
+ ldd [s1_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+16],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+20],%g1
+ ldd [s1_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+24],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+28],%g1
+ ldd [s1_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+32],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+36],%g1
+ ldd [s1_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1b)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1b):
+ addcc n,8-2,n
+ blt L(end1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract blocks of 2 limbs until less than 2 limbs remain
+L(loope1b):
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+8],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+12],%g1
+ ldd [s1_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1b)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1b):
+ subxcc %g2,%g4,%o4
+ subxcc %g3,%g1,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract last limb
+ ld [s2_ptr+8],%g4
+ ld [s1_ptr+8],%g2
+ subxcc %g2,%g4,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1b):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+C ** V2 **
+C If we come here, the alignment of s1_ptr and res_ptr differs, and so
+C does the alignment of s2_ptr and res_ptr.  Since there are only two
+C alignments we care about, s1_ptr and s2_ptr must therefore have the
+C same alignment.
+
+L(2): cmp n,1
+ be L(jone)
+ nop
+ andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0
+ be L(v2) C if no, branch
+ nop
+C Subtract least significant limb separately to align s1_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ subcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+
+L(v2): addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ blt L(fin2)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ ldd [s1_ptr+8],%g2
+ ldd [s2_ptr+8],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+8]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+12]
+ ldd [s1_ptr+16],%g2
+ ldd [s2_ptr+16],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+16]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+20]
+ ldd [s1_ptr+24],%g2
+ ldd [s2_ptr+24],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+24]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+28]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop2)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin2):
+ addcc n,8-2,n
+ blt L(end2)
+ subcc %g0,%o4,%g0 C restore cy
+L(loope2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope2)
+ subcc %g0,%o4,%g0 C restore cy
+L(end2):
+ andcc n,1,%g0
+ be L(ret2)
+ subcc %g0,%o4,%g0 C restore cy
+C Subtract last limb
+L(jone):
+ ld [s1_ptr],%g4
+ ld [s2_ptr],%g2
+ subxcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+
+L(ret2):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/sparc32/submul_1.asm b/rts/gmp/mpn/sparc32/submul_1.asm
new file mode 100644
index 0000000000..12abd844ce
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/submul_1.asm
@@ -0,0 +1,146 @@
+dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ subcc %o5,%g1,%g1
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne L(loop0)
+ ld [%o4+%o2],%o5
+
+ subcc %o5,%g1,%g1
+ addx %o0,%g0,%o0
+ retl
+ st %g1,[%o4+%o2]
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ subcc %o5,%g3,%g3
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0
+ addcc %o2,4,%o2
+ bne L(loop)
+ ld [%o4+%o2],%o5
+
+ subcc %o5,%g3,%g3
+ addx %o0,%g0,%o0
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc32/udiv_fp.asm b/rts/gmp/mpn/sparc32/udiv_fp.asm
new file mode 100644
index 0000000000..e340e147d2
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/udiv_fp.asm
@@ -0,0 +1,158 @@
+dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl This is for v7 CPUs with a floating-point unit.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr i0
+C n1 i1
+C n0 i2
+C d i3
+
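+C The quotient is first estimated in double-precision floating point:
+C q ~= (n1*2^32 + n0)/d, with each operand biased by 2^32 after
+C conversion when its sign bit is set, so the signed int-to-double
+C conversions come out unsigned.  A double's 53-bit mantissa can leave
+C the estimate off by one, so the mulscc sequence recomputes q*d
+C exactly and the tail code adjusts quotient and remainder either way.
+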
+ASM_START()
+
+ifdef(`PIC',
+` TEXT
+L(getpc):
+ retl
+ nop')
+
+ TEXT
+ ALIGN(8)
+L(C0): .double 0r4294967296
+L(C1): .double 0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+ save %sp,-104,%sp
+ st %i1,[%fp-8]
+ ld [%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc): call L(getpc) C put address of this insn in %o7
+ ldd [%o7+L(C0)-L(pc)],%f8',
+` sethi %hi(L(C0)),%o7
+ ldd [%o7+%lo(L(C0))],%f8')
+
+ fitod %f10,%f4
+ cmp %i1,0
+ bge L(248)
+ mov %i0,%i5
+ faddd %f4,%f8,%f4
+L(248):
+ st %i2,[%fp-8]
+ ld [%fp-8],%f10
+ fmuld %f4,%f8,%f6
+ cmp %i2,0
+ bge L(249)
+ fitod %f10,%f2
+ faddd %f2,%f8,%f2
+L(249):
+ st %i3,[%fp-8]
+ faddd %f6,%f2,%f2
+ ld [%fp-8],%f10
+ cmp %i3,0
+ bge L(250)
+ fitod %f10,%f4
+ faddd %f4,%f8,%f4
+L(250):
+ fdivd %f2,%f4,%f2
+
+ifdef(`PIC',
+` ldd [%o7+L(C1)-L(pc)],%f4',
+` sethi %hi(L(C1)),%o7
+ ldd [%o7+%lo(L(C1))],%f4')
+
+ fcmped %f2,%f4
+ nop
+ fbge,a L(251)
+ fsubd %f2,%f4,%f2
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ b L(252)
+ ld [%fp-8],%i4
+L(251):
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ ld [%fp-8],%i4
+ sethi %hi(-2147483648),%g2
+ xor %i4,%g2,%i4
+L(252):
+ wr %g0,%i4,%y
+ sra %i3,31,%g2
+ and %i4,%g2,%g2
+ andcc %g0,0,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,0,%g1
+ add %g1,%g2,%i0
+ rd %y,%g3
+ subcc %i2,%g3,%o7
+ subxcc %i1,%i0,%g0
+ be L(253)
+ cmp %o7,%i3
+
+ add %i4,-1,%i0
+ add %o7,%i3,%o7
+ st %o7,[%i5]
+ ret
+ restore
+L(253):
+ blu L(246)
+ mov %i4,%i0
+ add %i4,1,%i0
+ sub %o7,%i3,%o7
+L(246):
+ st %o7,[%i5]
+ ret
+ restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/udiv_nfp.asm b/rts/gmp/mpn/sparc32/udiv_nfp.asm
new file mode 100644
index 0000000000..ae19f4c6e9
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/udiv_nfp.asm
@@ -0,0 +1,193 @@
+dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl This is for v7 CPUs without a floating-point unit.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr o0
+C n1 o1
+C n0 o2
+C d o3
+
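+C Plain shift-and-subtract division, one quotient bit per step, four
+C steps per unrolled block, with %g1 counting 8 blocks (32 bits).  The
+C interleaved p*/n* label chains keep the partial remainder in either
+C %o1 or %o4, so a failed trial subtraction never needs an explicit
+C restore; the quotient bits accumulate inverted in %o2 and the final
+C xnor flips them.  Divisors with the top bit set take the
+C L(largedivisor) path, which halves the operands first and fixes up
+C quotient and remainder afterwards.
+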
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+ tst %o3
+ bneg L(largedivisor)
+ mov 8,%g1
+
+ b L(p1)
+ addxcc %o2,%o2,%o2
+
+L(plop):
+ bcc L(n1)
+ addxcc %o2,%o2,%o2
+L(p1): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n2)
+ addxcc %o2,%o2,%o2
+L(p2): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n3)
+ addxcc %o2,%o2,%o2
+L(p3): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n4)
+ addxcc %o2,%o2,%o2
+L(p4): addx %o1,%o1,%o1
+ addcc %g1,-1,%g1
+ bne L(plop)
+ subcc %o1,%o3,%o4
+ bcc L(n5)
+ addxcc %o2,%o2,%o2
+L(p5): st %o1,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(nlop):
+ bcc L(p1)
+ addxcc %o2,%o2,%o2
+L(n1): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p2)
+ addxcc %o2,%o2,%o2
+L(n2): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p3)
+ addxcc %o2,%o2,%o2
+L(n3): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p4)
+ addxcc %o2,%o2,%o2
+L(n4): addx %o4,%o4,%o4
+ addcc %g1,-1,%g1
+ bne L(nlop)
+ subcc %o4,%o3,%o1
+ bcc L(p5)
+ addxcc %o2,%o2,%o2
+L(n5): st %o4,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(largedivisor):
+ and %o2,1,%o5 C %o5 = n0 & 1
+
+ srl %o2,1,%o2
+ sll %o1,31,%g2
+ or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1)
+ srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1)
+
+ and %o3,1,%g2
+ srl %o3,1,%g3 C %g3 = floor(d / 2)
+ add %g3,%g2,%g3 C %g3 = ceil(d / 2)
+
+ b L(Lp1)
+ addxcc %o2,%o2,%o2
+
+L(Lplop):
+ bcc L(Ln1)
+ addxcc %o2,%o2,%o2
+L(Lp1): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln2)
+ addxcc %o2,%o2,%o2
+L(Lp2): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln3)
+ addxcc %o2,%o2,%o2
+L(Lp3): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln4)
+ addxcc %o2,%o2,%o2
+L(Lp4): addx %o1,%o1,%o1
+ addcc %g1,-1,%g1
+ bne L(Lplop)
+ subcc %o1,%g3,%o4
+ bcc L(Ln5)
+ addxcc %o2,%o2,%o2
+L(Lp5): add %o1,%o1,%o1 C << 1
+ tst %g2
+ bne L(oddp)
+ add %o5,%o1,%o1
+ st %o1,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(Lnlop):
+ bcc L(Lp1)
+ addxcc %o2,%o2,%o2
+L(Ln1): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp2)
+ addxcc %o2,%o2,%o2
+L(Ln2): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp3)
+ addxcc %o2,%o2,%o2
+L(Ln3): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp4)
+ addxcc %o2,%o2,%o2
+L(Ln4): addx %o4,%o4,%o4
+ addcc %g1,-1,%g1
+ bne L(Lnlop)
+ subcc %o4,%g3,%o1
+ bcc L(Lp5)
+ addxcc %o2,%o2,%o2
+L(Ln5): add %o4,%o4,%o4 C << 1
+ tst %g2
+ bne L(oddn)
+ add %o5,%o4,%o4
+ st %o4,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(oddp):
+ xnor %g0,%o2,%o2
+ C q' in %o2. r' in %o1
+ addcc %o1,%o2,%o1
+ bcc L(Lp6)
+ addx %o2,0,%o2
+ sub %o1,%o3,%o1
+L(Lp6): subcc %o1,%o3,%g0
+ bcs L(Lp7)
+ subx %o2,-1,%o2
+ sub %o1,%o3,%o1
+L(Lp7): st %o1,[%o0]
+ retl
+ mov %o2,%o0
+
+L(oddn):
+ xnor %g0,%o2,%o2
+ C q' in %o2. r' in %o4
+ addcc %o4,%o2,%o4
+ bcc L(Ln6)
+ addx %o2,0,%o2
+ sub %o4,%o3,%o4
+L(Ln6): subcc %o4,%o3,%g0
+ bcs L(Ln7)
+ subx %o2,-1,%o2
+ sub %o4,%o3,%o4
+L(Ln7): st %o4,[%o0]
+ retl
+ mov %o2,%o0
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/umul.asm b/rts/gmp/mpn/sparc32/umul.asm
new file mode 100644
index 0000000000..efa56851d6
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/umul.asm
@@ -0,0 +1,68 @@
+dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ wr %g0,%o1,%y
+ sra %o2,31,%g2 C Don't move this insn
+ and %o1,%g2,%g2 C Don't move this insn
+ andcc %g0,0,%g1 C Don't move this insn
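+C The wr above has up to three delay insns before %y is usable, which
+C the three insns marked "Don't move" fill.  The 32 mulscc steps below
+C form %o1 * %o2 bit-serially with %o2 treated as signed; %g2 (= %o1
+C when %o2 < 0, else 0) corrects the high word of the product back to
+C the unsigned interpretation at the end.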
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,0,%g1
+ rd %y,%g3
+ st %g3,[%o0]
+ retl
+ add %g1,%g2,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/sparc32/v8/addmul_1.asm b/rts/gmp/mpn/sparc32/v8/addmul_1.asm
new file mode 100644
index 0000000000..da44644b51
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/addmul_1.asm
@@ -0,0 +1,122 @@
+dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
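+C The loop below is equivalent to this C sketch (illustration only,
+C assuming 32-bit limbs; not the actual implementation):
+C
+C   mp_limb_t cy = 0;
+C   for (i = 0; i < size; i++)
+C     {
+C       unsigned long long p = (unsigned long long) s1_ptr[i] * s2_limb
+C                              + res_ptr[i] + cy;
+C       res_ptr[i] = (mp_limb_t) p;        /* low 32 bits */
+C       cy = (mp_limb_t) (p >> 32);        /* carry to next limb */
+C     }
+C   return cy;                             /* carry-out */
+C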
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ orcc %g0,%g0,%g2
+ ld [%o1+0],%o4 C 1
+
+ sll %o2,4,%g1
+ and %g1,(4-1)<<4,%g1
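+C %g1 = (size mod 4) * 16, the byte offset of the entry point into the
+C 4-way unrolled loop (each dispatch block below is 4 insns = 16 bytes)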
+ifdef(`PIC',
+` mov %o7,%g4 C Save return address register
+0: call 1f
+ add %o7,L(1)-0b,%g3
+1: mov %g4,%o7 C Restore return address register
+',
+` sethi %hi(L(1)),%g3
+ or %g3,%lo(L(1)),%g3
+')
+ jmp %g3+%g1
+ nop
+L(1):
+L(L00): add %o0,-4,%o0
+ b L(loop00) C 4, 8, 12, ...
+ add %o1,-4,%o1
+ nop
+L(L01): b L(loop01) C 1, 5, 9, ...
+ nop
+ nop
+ nop
+L(L10): add %o0,-12,%o0 C 2, 6, 10, ...
+ b L(loop10)
+ add %o1,4,%o1
+ nop
+L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ b L(loop11)
+ add %o1,-8,%o1
+ nop
+
+L(loop):
+ addcc %g3,%g2,%g3 C 1
+ ld [%o1+4],%o4 C 2
+ rd %y,%g2 C 1
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] C 1
+L(loop00):
+ umul %o4,%o3,%g3 C 2
+ ld [%o0+4],%g1 C 2
+ addxcc %g3,%g2,%g3 C 2
+ ld [%o1+8],%o4 C 3
+ rd %y,%g2 C 2
+ addx %g0,%g2,%g2
+ nop
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+4] C 2
+L(loop11):
+ umul %o4,%o3,%g3 C 3
+ addxcc %g3,%g2,%g3 C 3
+ ld [%o1+12],%o4 C 4
+ rd %y,%g2 C 3
+ add %o1,16,%o1
+ addx %g0,%g2,%g2
+ ld [%o0+8],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+8] C 3
+L(loop10):
+ umul %o4,%o3,%g3 C 4
+ addxcc %g3,%g2,%g3 C 4
+ ld [%o1+0],%o4 C 1
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+ ld [%o0+12],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+12] C 4
+ add %o0,16,%o0
+ addx %g0,%g2,%g2
+L(loop01):
+ addcc %o2,-4,%o2
+ bg L(loop)
+ umul %o4,%o3,%g3 C 1
+
+ addcc %g3,%g2,%g3 C 4
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] C 4
+ addx %g0,%g2,%o0
+
+ retl
+ nop
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/mul_1.asm b/rts/gmp/mpn/sparc32/v8/mul_1.asm
new file mode 100644
index 0000000000..801247553a
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/mul_1.asm
@@ -0,0 +1,103 @@
+dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and
+dnl store the product in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ sll %o2,4,%g1
+ and %g1,(4-1)<<4,%g1
+ifdef(`PIC',
+` mov %o7,%g4 C Save return address register
+0: call 1f
+ add %o7,L(1)-0b,%g3
+1: mov %g4,%o7 C Restore return address register
+',
+` sethi %hi(L(1)),%g3
+ or %g3,%lo(L(1)),%g3
+')
+ jmp %g3+%g1
+ ld [%o1+0],%o4 C 1
+L(1):
+L(L00): add %o0,-4,%o0
+ add %o1,-4,%o1
+ b L(loop00) C 4, 8, 12, ...
+ orcc %g0,%g0,%g2
+L(L01): b L(loop01) C 1, 5, 9, ...
+ orcc %g0,%g0,%g2
+ nop
+ nop
+L(L10): add %o0,-12,%o0 C 2, 6, 10, ...
+ add %o1,4,%o1
+ b L(loop10)
+ orcc %g0,%g0,%g2
+ nop
+L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ add %o1,-8,%o1
+ b L(loop11)
+ orcc %g0,%g0,%g2
+
+L(loop):
+ addcc %g3,%g2,%g3 C 1
+ ld [%o1+4],%o4 C 2
+ st %g3,[%o0+0] C 1
+ rd %y,%g2 C 1
+L(loop00):
+ umul %o4,%o3,%g3 C 2
+ addxcc %g3,%g2,%g3 C 2
+ ld [%o1+8],%o4 C 3
+ st %g3,[%o0+4] C 2
+ rd %y,%g2 C 2
+L(loop11):
+ umul %o4,%o3,%g3 C 3
+ addxcc %g3,%g2,%g3 C 3
+ ld [%o1+12],%o4 C 4
+ add %o1,16,%o1
+ st %g3,[%o0+8] C 3
+ rd %y,%g2 C 3
+L(loop10):
+ umul %o4,%o3,%g3 C 4
+ addxcc %g3,%g2,%g3 C 4
+ ld [%o1+0],%o4 C 1
+ st %g3,[%o0+12] C 4
+ add %o0,16,%o0
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+L(loop01):
+ addcc %o2,-4,%o2
+ bg L(loop)
+ umul %o4,%o3,%g3 C 1
+
+ addcc %g3,%g2,%g3 C 4
+ st %g3,[%o0+0] C 4
+ rd %y,%g2 C 4
+
+ retl
+ addx %g0,%g2,%o0
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/submul_1.asm b/rts/gmp/mpn/sparc32/v8/submul_1.asm
new file mode 100644
index 0000000000..9ed132f4c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/submul_1.asm
@@ -0,0 +1,58 @@
+dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
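+C The loop below is equivalent to this C sketch (illustration only,
+C assuming 32-bit limbs; not the actual implementation):
+C
+C   mp_limb_t cy = 0;
+C   for (i = 0; i < size; i++)
+C     {
+C       unsigned long long p = (unsigned long long) s1_ptr[i] * s2_limb
+C                              + cy;
+C       mp_limb_t lo = (mp_limb_t) p;
+C       cy = (mp_limb_t) (p >> 32) + (res_ptr[i] < lo);  /* add borrow */
+C       res_ptr[i] -= lo;
+C     }
+C   return cy;                             /* borrow-out */
+C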
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ sub %g0,%o2,%o2 C negate ...
+ sll %o2,2,%o2 C ... and scale size
+ sub %o1,%o2,%o1 C o1 is offset s1_ptr
+ sub %o0,%o2,%g1 C g1 is offset res_ptr
+
+ mov 0,%o0 C clear cy_limb
+
+L(loop):
+ ld [%o1+%o2],%o4
+ ld [%g1+%o2],%g2
+ umul %o4,%o3,%o5
+ rd %y,%g3
+ addcc %o5,%o0,%o5
+ addx %g3,0,%o0
+ subcc %g2,%o5,%g2
+ addx %o0,0,%o0
+ st %g2,[%g1+%o2]
+
+ addcc %o2,4,%o2
+ bne L(loop)
+ nop
+
+ retl
+ nop
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm
new file mode 100644
index 0000000000..0d5e8d415d
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm
@@ -0,0 +1,122 @@
+dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h.
+dnl This is for SuperSPARC only, to compensate for its semi-functional
+dnl udiv instruction.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr i0
+C n1 i1
+C n0 i2
+C d i3
+
+ASM_START()
+
+ifdef(`PIC',
+` TEXT
+L(getpc):
+ retl
+ nop')
+
+ TEXT
+ ALIGN(8)
+L(C0): .double 0r4294967296
+L(C1): .double 0r2147483648
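+
+C L(C0) = 2^32, added to operands whose sign bit is set since fitod
+C converts signed integers; L(C1) = 2^31, the point beyond which the
+C quotient must be offset before the signed fdtoi conversion.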
+
+PROLOGUE(mpn_udiv_qrnnd)
+ save %sp,-104,%sp
+ st %i1,[%fp-8]
+ ld [%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc): call L(getpc) C put address of this insn in %o7
+ ldd [%o7+L(C0)-L(pc)],%f8',
+` sethi %hi(L(C0)),%o7
+ ldd [%o7+%lo(L(C0))],%f8')
+
+ fitod %f10,%f4
+ cmp %i1,0
+ bge L(248)
+ mov %i0,%i5
+ faddd %f4,%f8,%f4
+L(248):
+ st %i2,[%fp-8]
+ ld [%fp-8],%f10
+ fmuld %f4,%f8,%f6
+ cmp %i2,0
+ bge L(249)
+ fitod %f10,%f2
+ faddd %f2,%f8,%f2
+L(249):
+ st %i3,[%fp-8]
+ faddd %f6,%f2,%f2
+ ld [%fp-8],%f10
+ cmp %i3,0
+ bge L(250)
+ fitod %f10,%f4
+ faddd %f4,%f8,%f4
+L(250):
+ fdivd %f2,%f4,%f2
+
+ifdef(`PIC',
+` ldd [%o7+L(C1)-L(pc)],%f4',
+` sethi %hi(L(C1)),%o7
+ ldd [%o7+%lo(L(C1))],%f4')
+
+ fcmped %f2,%f4
+ nop
+ fbge,a L(251)
+ fsubd %f2,%f4,%f2
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ b L(252)
+ ld [%fp-8],%i4
+L(251):
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ ld [%fp-8],%i4
+ sethi %hi(-2147483648),%g2
+ xor %i4,%g2,%i4
+L(252):
+ umul %i3,%i4,%g3
+ rd %y,%i0
+ subcc %i2,%g3,%o7
+ subxcc %i1,%i0,%g0
+ be L(253)
+ cmp %o7,%i3
+
+ add %i4,-1,%i0
+ add %o7,%i3,%o7
+ st %o7,[%i5]
+ ret
+ restore
+L(253):
+ blu L(246)
+ mov %i4,%i0
+ add %i4,1,%i0
+ sub %o7,%i3,%o7
+L(246):
+ st %o7,[%i5]
+ ret
+ restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/v8/umul.asm b/rts/gmp/mpn/sparc32/v8/umul.asm
new file mode 100644
index 0000000000..ae8f692a0a
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/umul.asm
@@ -0,0 +1,31 @@
+dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
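+C umul leaves the low 32 bits of the product in %g2 and the high 32
+C bits in the %y register.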
+ umul %o1,%o2,%g2
+ st %g2,[%o0]
+ retl
+ rd %y,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/sparc32/v9/README b/rts/gmp/mpn/sparc32/v9/README
new file mode 100644
index 0000000000..9b39713271
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/README
@@ -0,0 +1,4 @@
+Code for SPARC processors implementing version 9 of the SPARC architecture.
+This code is for systems that don't preserve the full 64-bit contents of
+integer registers at context switch. For other systems (such as Solaris 7 or
+later) use the code in ../../sparc64.
diff --git a/rts/gmp/mpn/sparc32/v9/addmul_1.asm b/rts/gmp/mpn/sparc32/v9/addmul_1.asm
new file mode 100644
index 0000000000..c1762cc41f
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/addmul_1.asm
@@ -0,0 +1,288 @@
+dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+PROLOGUE(mpn_addmul_1)
+ save %sp,-256,%sp
+
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hi(L(noll)),%g1
+ ld [%g1+%lo(L(noll))],%f10')
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f8
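+C %f6 = (double) (s2_limb & 0xffff) and %f8 = (double) (s2_limb >> 16).
+C Each 16-bit x 32-bit partial product is < 2^48 and therefore exact in
+C a double's 53-bit mantissa, which is what makes the fmuld scheme safe.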
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1],%f11
+ subcc %i2,1,%i2
+ be,pn %icc,L(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end2)
+ std %f12,[%fp-16]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end3)
+ std %f12,[%fp-32]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,L(end4)
+ std %f12,[%fp-16]
+
+ b,a L(loopm)
+
+ .align 16
+C BEGIN LOOP
+L(loop):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ subcc %i2,1,%i2
+ be,pn %icc,L(loope)
+ add %i0,4,%i0 C res_ptr++
+L(loopm):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ subcc %i2,1,%i2
+ bne,pt %icc,L(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(xxx)
+L(loope):
+L(end4):
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(yyy)
+
+L(end3):
+ fxtod %f10,%f2
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+L(xxx): fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end1):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+L(ret): add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ st %g4,[%i0-4]
+
+ ret
+ restore %g0,%g3,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/v9/gmp-mparam.h b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h
new file mode 100644
index 0000000000..f946b900f0
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h
@@ -0,0 +1,69 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+/* These values are for UltraSPARC I, II, and IIi. It is bogus that
+ this file lives in v9, but that will do for now. */
+
+/* Variations in addmul_1 speed make the multiply and square thresholds
+ doubtful. TOOM3_SQR_THRESHOLD had to be estimated here. */
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 30
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 200
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 59
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 500
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 107
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 146
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 3
+#endif
diff --git a/rts/gmp/mpn/sparc32/v9/mul_1.asm b/rts/gmp/mpn/sparc32/v9/mul_1.asm
new file mode 100644
index 0000000000..f8f0fdd8c2
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/mul_1.asm
@@ -0,0 +1,267 @@
+dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result in a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+PROLOGUE(mpn_mul_1)
+ save %sp,-256,%sp
+
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hi(L(noll)),%g1
+ ld [%g1+%lo(L(noll))],%f10')
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1],%f11
+ subcc %i2,1,%i2
+ be,pn %icc,L(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end2)
+ std %f12,[%fp-16]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end3)
+ std %f12,[%fp-32]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,L(end4)
+ std %f12,[%fp-16]
+
+ b,a L(loopm)
+
+ .align 16
+C BEGIN LOOP
+L(loop):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ subcc %i2,1,%i2
+ be,pn %icc,L(loope)
+ add %i0,4,%i0 C res_ptr++
+L(loopm):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ subcc %i2,1,%i2
+ bne,pt %icc,L(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(xxx)
+L(loope):
+L(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(yyy)
+
+L(end3):
+ fxtod %f10,%f2
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+L(xxx): fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end1):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+L(ret): add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ st %g4,[%i0-4]
+
+ ret
+ restore %g0,%g3,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/v9/submul_1.asm b/rts/gmp/mpn/sparc32/v9/submul_1.asm
new file mode 100644
index 0000000000..6195ea88ea
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/submul_1.asm
@@ -0,0 +1,291 @@
+dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-256,%sp
+
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hi(L(noll)),%g1
+ ld [%g1+%lo(L(noll))],%f10')
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1],%f11
+ subcc %i2,1,%i2
+ be,pn %icc,L(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end2)
+ std %f12,[%fp-16]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end3)
+ std %f12,[%fp-32]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,L(end4)
+ std %f12,[%fp-16]
+
+ b,a L(loopm)
+
+ .align 16
+C BEGIN LOOP
+L(loop):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ addx %g3,0,%g3
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ subcc %i2,1,%i2
+ be,pn %icc,L(loope)
+ add %i0,4,%i0 C res_ptr++
+L(loopm):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ addx %g3,0,%g3
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ subcc %i2,1,%i2
+ bne,pt %icc,L(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ b,a L(xxx)
+L(loope):
+L(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ b,a L(yyy)
+
+L(end3):
+ fxtod %f10,%f2
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+L(xxx): fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end1):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+L(ret): add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ srlx %g4,32,%g3
+ st %l2,[%i0-4]
+
+ addx %g3,%g0,%g3
+ ret
+ restore %g0,%g3,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README
new file mode 100644
index 0000000000..6923a133f3
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/README
@@ -0,0 +1,48 @@
+This directory contains mpn functions for 64-bit V9 SPARC
+
+RELEVANT OPTIMIZATION ISSUES
+
+The Ultra I/II pipeline executes up to two simple integer arithmetic operations
+per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to
+35 cycles, depending on the position of the most significant bit of the 1st
+source operand. It cannot overlap with other instructions. For our use of
+mulx, it will take from 5 to 20 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions. No conditional move can issue 1-5 cycles after a load. (Or
+something similarly bizarre.)
+
+Integer branches can issue with two integer arithmetic instructions. Likewise
+for integer loads. Four instructions may issue (arith, arith, ld/st, branch)
+but only if the branch is last.
+
+(The V9 architecture manual recommends that the 2nd operand of a multiply
+instruction be the smaller one. For UltraSPARC, they got things backwards and
+optimize for the wrong operand! Really helpful, given that multiply
+is incredibly slow on these CPUs!)
+
+STATUS
+
+There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but
+the pipelines are worked out. Here are the timings:
+
+* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb.
+
+* add_n, sub_n: add3.s currently runs at 6 cycles/limb. We use a bizarre
+ scheme of compares and branches (with some nops and fnops to align things)
+ and carefully stay away from the instructions intended for this application
+ (i.e., movcs and movcc).
+
+ Using movcc/movcs, even with deep unrolling, seems to get down to 7
+ cycles/limb.
+
+  The most promising approach is to split operands in 32-bit pieces using
+  srlx, then use two addccc, and finally combine the results with sllx+or
+  (see the sketch at the end of this file).  The result could run at 5
+  cycles/limb, I think.  It might be possible to do without unrolling, or
+  with minimal unrolling.
+
+* addmul_1/submul_1: Should optimize for when scalar operand < 2^32.
+* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II,
+ Karatsuba's method should save up to 16 cycles (i.e. > 20%).
+* mul_1 (and possibly the other multiply functions): Handle carry in the
+ same tricky way as add_n,sub_n.
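+
+A minimal C sketch of the split-operand add scheme mentioned above
+(illustration only, not GMP code; a and b are 64-bit limbs, cy the
+incoming carry):
+
+  unsigned long lo = (a & 0xffffffff) + (b & 0xffffffff) + cy;
+  unsigned long hi = (a >> 32) + (b >> 32) + (lo >> 32);
+  cy = hi >> 32;                          /* carry out */
+  r  = (hi << 32) | (lo & 0xffffffff);    /* recombine with sllx+or */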
diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm
new file mode 100644
index 0000000000..72b3895a5b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/add_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_add_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ addccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ addccc %o0,%o4,%g0
+!-
+ addccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ addccc %o1,%o5,%g0
+!-
+ addccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ addccc %o2,%g5,%g0
+!-
+ addccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ addccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ addccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ addccc %o0,%o4,%g0
+ addccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ addccc %o1,%o5,%g0
+ addccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ addccc %o2,%g5,%g0
+ addccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ addccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ addccc %l0,%l4,%g1
+ addccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm
new file mode 100644
index 0000000000..96cb5f7369
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul1h.asm
@@ -0,0 +1,203 @@
+dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`addmull:',
+`addmulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ifdef(`LOWPART',
+` add %g5,%g1,%g1') C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(addmull)',
+`EPILOGUE(addmulu)')
diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm
new file mode 100644
index 0000000000..c3f04cea6a
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_addmul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
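+C If s1_ptr overlaps the destination (the two half-limb passes below
+C would otherwise reread limbs already updated), copy the source to a
+C temporary block on the stack first.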
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmull
+ srl %i3,0,%o3
+
+	mov %o0,%l0	C keep carry-out from addmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_addmul_1)
+
+C Put a zero in the text segment to allow us to get its address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
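+C The helper in addmul1h.asm is expanded twice: as addmull for the low
+C 32-bit half of s2_limb (LOWPART defined) and as addmulu for the high
+C half, with DLO/DHI selecting which 32-bit word of each limb is
+C addressed.
+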
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/addmul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm
new file mode 100644
index 0000000000..d9957e3c90
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/copyi.asm
@@ -0,0 +1,79 @@
+! SPARC v9 __gmpn_copyi -- Copy a limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! rptr %o0
+! sptr %o1
+! n %o2
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_copyi)
+ add %o2,-8,%o2
+ brlz,pn %o2,L(skip)
+ nop
+ b,a L(loop1)
+ nop
+
+ ALIGN(16)
+L(loop1):
+ ldx [%o1+0],%g1
+ ldx [%o1+8],%g2
+ ldx [%o1+16],%g3
+ ldx [%o1+24],%g4
+ ldx [%o1+32],%g5
+ ldx [%o1+40],%o3
+ ldx [%o1+48],%o4
+ ldx [%o1+56],%o5
+ add %o1,64,%o1
+ stx %g1,[%o0+0]
+ stx %g2,[%o0+8]
+ stx %g3,[%o0+16]
+ stx %g4,[%o0+24]
+ stx %g5,[%o0+32]
+ stx %o3,[%o0+40]
+ stx %o4,[%o0+48]
+ stx %o5,[%o0+56]
+ add %o2,-8,%o2
+ brgez,pt %o2,L(loop1)
+ add %o0,64,%o0
+
+L(skip):
+ add %o2,8,%o2
+ brz,pt %o2,L(end)
+ nop
+
+L(loop2):
+ ldx [%o1],%g1
+ add %o1,8,%o1
+ add %o2,-1,%o2
+ stx %g1,[%o0]
+ add %o0,8,%o0
+ brgz,pt %o2,L(loop2)
+ nop
+
+L(end): retl
+ nop
+EPILOGUE(mpn_copyi)
diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h
new file mode 100644
index 0000000000..74f61661c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/gmp-mparam.h
@@ -0,0 +1,88 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Tell the toom3 multiply implementation to call low-level mpn
+ functions instead of open-coding operations in C. */
+#define USE_MORE_MPN 1
+
+
+/* Run on Sun Workshop cc. */
+/* Generated by tuneup.c, 2000-07-30. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 12
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 95
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 33
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 125
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 27
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 107
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 12
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 199
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 320
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 1664
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 320
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 1664
+#endif
diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm
new file mode 100644
index 0000000000..2d2edc50a7
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/lshift.asm
@@ -0,0 +1,97 @@
+! SPARC v9 __gmpn_lshift -- Shift a limb vector left.
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_lshift)
+ sllx %o2,3,%g1
+ add %o1,%g1,%o1 ! make %o1 point at end of src
+ ldx [%o1-8],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o0,%g1,%o0 ! make %o0 point at end of res
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ srlx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1-16],%g3
+ add %o0,-8,%o0
+ add %o1,-8,%o1
+ add %g4,-1,%g4
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0+0]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1-16],%g3
+ add %o0,-32,%o0
+ add %o2,-4,%o2
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-24],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+24]
+ srlx %g2,%o5,%g1
+
+ ldx [%o1-32],%g3
+ sllx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0+16]
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-40],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+8]
+ srlx %g2,%o5,%g1
+
+ add %o1,-32,%o1
+ or %g4,%g1,%g4
+ brnz,pt %o2,L(loop1)
+ stx %g4,[%o0+0]
+
+L(end): sllx %g2,%o3,%g2
+ stx %g2,[%o0-8]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm
new file mode 100644
index 0000000000..f2f2821d51
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1.asm
@@ -0,0 +1,113 @@
+dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_mul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
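+
+C Outline: s2_limb is handled in two 32-bit halves.  The helper mull
+C computes s1 * low32(s2_limb); addmulu then adds in s1 * high32(s2_limb)
+C shifted up 32 bits (hence the res_ptr+4 below), unless the high half
+C is zero.  With 0.0 kept in %f10, a 32-bit ld into %f11 leaves the
+C %f10:%f11 pair holding the value zero-extended to 64 bits, ready for
+C fxtod.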
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call mull
+ srl %i3,0,%o3
+
+	mov	%o0,%l0	C keep carry-out from mull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_mul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`E',`L($1)')
+include_mpn(`sparc64/mul_1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm
new file mode 100644
index 0000000000..5078c01c3f
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1h.asm
@@ -0,0 +1,183 @@
+dnl SPARC 64-bit mull -- Helper for mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+mull:
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4
+
+ ret
+	restore %g0,%g4,%o0	C side effect: put cy in retreg
+EPILOGUE(mull)
diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm
new file mode 100644
index 0000000000..baf7920efb
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/rshift.asm
@@ -0,0 +1,94 @@
+! SPARC v9 __gmpn_rshift -- Shift a limb vector right by CNT bits and store
+! the result in a second limb vector.
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_rshift)
+ ldx [%o1],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ sllx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1+8],%g3
+ add %o0,8,%o0
+ add %o1,8,%o1
+ add %g4,-1,%g4
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0-8]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1+8],%g3
+ add %o0,32,%o0
+ add %o2,-4,%o2
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+16],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-32]
+ sllx %g2,%o5,%g1
+
+ ldx [%o1+24],%g3
+ srlx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0-24]
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+32],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-16]
+ sllx %g2,%o5,%g1
+
+ add %o1,32,%o1
+ or %g4,%g1,%g4
+ brnz %o2,L(loop1)
+ stx %g4,[%o0-8]
+
+L(end): srlx %g2,%o3,%g2
+ stx %g2,[%o0-0]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm
new file mode 100644
index 0000000000..61547138e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/sub_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_sub_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 subccc, executing alone, +8 cycles
+! Unrolling not mandatory... perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ subccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ subccc %o0,%o4,%g0
+!-
+ subccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ subccc %o1,%o5,%g0
+!-
+ subccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ subccc %o2,%g5,%g0
+!-
+ subccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ subccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ subccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ subccc %o0,%o4,%g0
+ subccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ subccc %o1,%o5,%g0
+ subccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ subccc %o2,%g5,%g0
+ subccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ subccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ subccc %l0,%l4,%g1
+ subccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm
new file mode 100644
index 0000000000..7f51ba59c6
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul1h.asm
@@ -0,0 +1,204 @@
+dnl SPARC 64-bit submull/submulu -- Helpers for mpn_submul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`submull:',
+`submulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+	subxcc	%g5,%g4,%l2	C subtract p from *res_ptr (SUB2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+	subxcc	%g5,%g4,%l2	C subtract p from *res_ptr (SUB2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+	subxcc	%g5,%g4,%l2	C subtract p from *res_ptr (SUB2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+	subxcc	%g5,%g4,%l2	C subtract p from *res_ptr (SUB2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+	subxcc	%g5,%g4,%l2	C subtract p from *res_ptr (SUB2)
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+`	subxcc	%g5,%g4,%l2')	C subtract p from *res_ptr (SUB2)
+ifdef(`LOWPART',
+` st %l2,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ addx %g4,0,%g4
+ ret
+	restore %g0,%g4,%o0	C side effect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(submull)',
+`EPILOGUE(submulu)')
diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm
new file mode 100644
index 0000000000..7c6af0a98b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-subtract with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submull
+ srl %i3,0,%o3
+
+	mov	%o0,%l0	C keep carry-out from submull
+
+C Now multiply-subtract with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_submul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/submul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/submul1h.asm')
diff --git a/rts/gmp/mpn/thumb/add_n.s b/rts/gmp/mpn/thumb/add_n.s
new file mode 100644
index 0000000000..c1eeb6ca87
--- /dev/null
+++ b/rts/gmp/mpn/thumb/add_n.s
@@ -0,0 +1,50 @@
+@ ARM/Thumb __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+@ sum in a third limb vector.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_add_n
+___gmpn_add_n:
+ push {r4, r5, r6, lr}
+ mov r6, #1 @ init carry save register
+
+Loop: sub r6, #1 @ restore carry (set iff r6 was 0)
+ ldmia r1!, {r4} @ load next limb from S1
+ ldmia r2!, {r5} @ load next limb from S2
+ adc r4, r5
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save negated carry
+ sub r3, #1
+	bge	Loop			@ loop back while remaining count >= 0
+
+ mov r0, r6
+ pop {r4, r5, r6, pc}
diff --git a/rts/gmp/mpn/thumb/sub_n.s b/rts/gmp/mpn/thumb/sub_n.s
new file mode 100644
index 0000000000..53c292375f
--- /dev/null
+++ b/rts/gmp/mpn/thumb/sub_n.s
@@ -0,0 +1,50 @@
+@ ARM/Thumb __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+@ store difference in a third limb vector.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_sub_n
+___gmpn_sub_n:
+ push {r4, r5, r6, lr}
+ mov r6, #1 @ init carry save register
+
+Loop: sub r6, #1 @ restore carry (set iff r6 was 0)
+ ldmia r1!, {r4} @ load next limb from S1
+ ldmia r2!, {r5} @ load next limb from S2
+ sbc r4, r5
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save negated carry
+ sub r3, #1
+	bge	Loop			@ loop back while remaining count >= 0
+
+ mov r0, r6
+ pop {r4, r5, r6, pc}
diff --git a/rts/gmp/mpn/underscore.h b/rts/gmp/mpn/underscore.h
new file mode 100644
index 0000000000..240dae0f63
--- /dev/null
+++ b/rts/gmp/mpn/underscore.h
@@ -0,0 +1,26 @@
+/*
+Copyright (C) 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#if __STDC__
+#define C_SYMBOL_NAME(name) _##name
+#else
+#define C_SYMBOL_NAME(name) _/**/name
+#endif
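+
+/* For example, C_SYMBOL_NAME(mpn_add_n) expands to _mpn_add_n either
+   way: an ANSI preprocessor pastes the tokens with ##, while a
+   pre-ANSI one deletes the empty comment and concatenates the
+   surrounding text.  */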
diff --git a/rts/gmp/mpn/vax/add_n.s b/rts/gmp/mpn/vax/add_n.s
new file mode 100644
index 0000000000..cf4060f521
--- /dev/null
+++ b/rts/gmp/mpn/vax/add_n.s
@@ -0,0 +1,61 @@
+# VAX __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# s2_ptr (sp + 12)
+# size (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ .word 0x0
+ movl 16(ap),r0
+ movl 12(ap),r1
+ movl 8(ap),r2
+ movl 4(ap),r3
+ mnegl r0,r5
+ addl2 $3,r0
+ ashl $-2,r0,r0 # unroll loop count
+	bicl2	$-4,r5	# keep only the low 2 bits
+	movaq	(r5)[r5],r5	# 9x, each 3-insn limb group below is 9 bytes
+ jmp Loop(r5)
+
+Loop: movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop
+
+ adwc r0,r0
+ ret
diff --git a/rts/gmp/mpn/vax/addmul_1.s b/rts/gmp/mpn/vax/addmul_1.s
new file mode 100644
index 0000000000..379061dcb7
--- /dev/null
+++ b/rts/gmp/mpn/vax/addmul_1.s
@@ -0,0 +1,126 @@
+# VAX __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ addl2 r2,(r9)+
+ adwc r1,r3
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ addl2 r10,(r9)+
+ adwc r1,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/vax/lshift.s b/rts/gmp/mpn/vax/lshift.s
new file mode 100644
index 0000000000..fd311a9782
--- /dev/null
+++ b/rts/gmp/mpn/vax/lshift.s
@@ -0,0 +1,58 @@
+# VAX __gmpn_lshift -- left shift.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr (sp + 4)
+# sptr (sp + 8)
+# size (sp + 12)
+# cnt (sp + 16)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_lshift
+___gmpn_lshift:
+ .word 0x1c0
+ movl 4(ap),r7
+ movl 8(ap),r6
+ movl 12(ap),r1
+ movl 16(ap),r8
+
+ moval (r6)[r1],r6
+ moval (r7)[r1],r7
+ clrl r3
+ movl -(r6),r2
+ ashq r8,r2,r4
+ movl r5,r0
+ movl r2,r3
+ decl r1
+ jeql Lend
+
+Loop: movl -(r6),r2
+ ashq r8,r2,r4
+ movl r5,-(r7)
+ movl r2,r3
+ jsobgtr r1,Loop
+
+Lend: movl r4,-4(r7)
+ ret
diff --git a/rts/gmp/mpn/vax/mul_1.s b/rts/gmp/mpn/vax/mul_1.s
new file mode 100644
index 0000000000..708e8ca6ca
--- /dev/null
+++ b/rts/gmp/mpn/vax/mul_1.s
@@ -0,0 +1,123 @@
+# VAX __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+# One might want to combine the addl2 and the store below, but that
+# is actually just slower according to my timing tests. (VAX 3600)
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ movl r2,(r9)+
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ movl r2,(r9)+
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r1,r3
+ addl2 r11,r2
+ adwc r6,r3
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r1,r11
+ addl2 r3,r10
+ adwc r6,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/vax/rshift.s b/rts/gmp/mpn/vax/rshift.s
new file mode 100644
index 0000000000..515813208d
--- /dev/null
+++ b/rts/gmp/mpn/vax/rshift.s
@@ -0,0 +1,56 @@
+# VAX __gmpn_rshift -- right shift.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr (sp + 4)
+# sptr (sp + 8)
+# size (sp + 12)
+# cnt (sp + 16)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_rshift
+___gmpn_rshift:
+ .word 0x1c0
+ movl 4(ap),r7
+ movl 8(ap),r6
+ movl 12(ap),r1
+ movl 16(ap),r8
+
+ movl (r6)+,r2
+ subl3 r8,$32,r8
+ ashl r8,r2,r0
+ decl r1
+ jeql Lend
+
+Loop: movl (r6)+,r3
+ ashq r8,r2,r4
+ movl r5,(r7)+
+ movl r3,r2
+ jsobgtr r1,Loop
+
+Lend: clrl r3
+ ashq r8,r2,r4
+ movl r5,(r7)
+ ret
diff --git a/rts/gmp/mpn/vax/sub_n.s b/rts/gmp/mpn/vax/sub_n.s
new file mode 100644
index 0000000000..eff4b1c044
--- /dev/null
+++ b/rts/gmp/mpn/vax/sub_n.s
@@ -0,0 +1,61 @@
+# VAX __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+# difference in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# s2_ptr (sp + 12)
+# size (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ .word 0x0
+ movl 16(ap),r0
+ movl 12(ap),r1
+ movl 8(ap),r2
+ movl 4(ap),r3
+ mnegl r0,r5
+ addl2 $3,r0
+ ashl $-2,r0,r0 # unroll loop count
+	bicl2	$-4,r5	# keep only the low 2 bits
+	movaq	(r5)[r5],r5	# 9x, each 3-insn limb group below is 9 bytes
+ jmp Loop(r5)
+
+Loop: movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop
+
+ adwc r0,r0
+ ret
diff --git a/rts/gmp/mpn/vax/submul_1.s b/rts/gmp/mpn/vax/submul_1.s
new file mode 100644
index 0000000000..be42286935
--- /dev/null
+++ b/rts/gmp/mpn/vax/submul_1.s
@@ -0,0 +1,126 @@
+# VAX __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_submul_1
+___gmpn_submul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ subl2 r2,(r9)+
+ adwc r1,r3
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ subl2 r10,(r9)+
+ adwc r1,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/x86/README b/rts/gmp/mpn/x86/README
new file mode 100644
index 0000000000..3507548b8c
--- /dev/null
+++ b/rts/gmp/mpn/x86/README
@@ -0,0 +1,40 @@
+
+ X86 MPN SUBROUTINES
+
+
+This directory contains mpn functions for various 80x86 chips.
+
+
+CODE ORGANIZATION
+
+ x86 i386, i486, generic
+ x86/pentium Intel Pentium (P5, P54)
+ x86/pentium/mmx Intel Pentium with MMX (P55)
+ x86/p6 Intel Pentium Pro
+ x86/p6/mmx Intel Pentium II, III
+ x86/p6/p3mmx Intel Pentium III
+ x86/k6 AMD K6, K6-2, K6-3
+ x86/k6/mmx
+ x86/k6/k62mmx AMD K6-2
+ x86/k7 AMD Athlon
+ x86/k7/mmx
+
+
+The x86 directory is also the main support for P6 at the moment; its code
+is something of a blended style, meant to be reasonable on all x86s.
+
+
+
+STATUS
+
+The code is well-optimized for AMD and Intel chips, but not so well
+optimized for Cyrix chips.
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+For implementations with slow double shift instructions (SHLD and
+SHRD), it might be better to mimic their operation with SHL+SHR+OR.
+(M2 is likely to benefit from that, but not Pentium due to its slow
+plain SHL and SHR.)
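+
+As a sketch of what such a mimic computes (C notation for illustration
+only, valid for shift counts 1 to 31; the function name is ours, not
+code from the library):
+
+	/* what "shldl %cl,%esi,%eax" leaves in %eax */
+	unsigned shld32 (unsigned dst, unsigned src, unsigned cnt)
+	{
+	  return (dst << cnt) | (src >> (32 - cnt));
+	}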
diff --git a/rts/gmp/mpn/x86/README.family b/rts/gmp/mpn/x86/README.family
new file mode 100644
index 0000000000..3bc73f58b0
--- /dev/null
+++ b/rts/gmp/mpn/x86/README.family
@@ -0,0 +1,333 @@
+
+ X86 CPU FAMILY MPN SUBROUTINES
+
+
+This file has some notes on things common to all the x86 family code.
+
+
+
+ASM FILES
+
+The x86 .asm files are BSD style x86 assembler code, first put through m4
+for macro processing. The generic mpn/asm-defs.m4 is used, together with
+mpn/x86/x86-defs.m4. Detailed notes are in those files.
+
+The code is meant for use with GNU "gas" or a system "as". There's no
+support for assemblers that demand Intel style, and with gas freely
+available and easy to use that shouldn't be a problem.
+
+
+
+STACK FRAME
+
+m4 macros are used to define the parameters passed on the stack, and these
+act like comments on what the stack frame looks like too. For example,
+mpn_mul_1() has the following.
+
+ defframe(PARAM_MULTIPLIER, 16)
+ defframe(PARAM_SIZE, 12)
+ defframe(PARAM_SRC, 8)
+ defframe(PARAM_DST, 4)
+
+Here PARAM_MULTIPLIER gets defined as `FRAME+16(%esp)', and the others
+similarly. The return address is at offset 0, but there's not normally any
+need to access that.
+
+FRAME is redefined as necessary through the code so it's the number of bytes
+pushed on the stack, and hence the offsets in the parameter macros stay
+correct. At the start of a routine FRAME should be zero.
+
+ deflit(`FRAME',0)
+ ...
+ deflit(`FRAME',4)
+ ...
+ deflit(`FRAME',8)
+ ...
+
+Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and
+FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions,
+and can be used instead of explicit definitions if preferred.
+defframe_pushl() is a combination FRAME_pushl() and defframe().
+
+There's generally some slackness in redefining FRAME. If new values aren't
+going to get used, then the redefinitions are omitted to keep from
+cluttering up the code. This happens for instance at the end of a routine,
+where there might be just four register pops and then a ret, so FRAME isn't
+getting used.
+
+Local variables and saved registers can be similarly defined, with negative
+offsets representing stack space below the initial stack pointer. For
+example,
+
+ defframe(SAVE_ESI, -4)
+ defframe(SAVE_EDI, -8)
+ defframe(VAR_COUNTER,-12)
+
+ deflit(STACK_SPACE, 12)
+
+Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the
+space, and that instruction must be followed by a redefinition of FRAME
+(setting it equal to STACK_SPACE) to reflect the change in %esp.
+
+Definitions for pushed registers are only put in when they're going to be
+used. If registers are just saved and restored with pushes and pops then
+definitions aren't made.
+
+
+
+ASSEMBLER EXPRESSIONS
+
+Only addition and subtraction seem to be universally available; certainly
+that's all the Solaris 8 "as" seems to accept. If expressions are wanted
+then m4 eval() should be used.
+
+In particular note that a "/" anywhere in a line starts a comment in Solaris
+"as", and in some configurations of gas too.
+
+ addl $32/2, %eax <-- wrong
+
+ addl $eval(32/2), %eax <-- right
+
+Binutils gas/config/tc-i386.c has a choice between "/" being a comment
+anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select
+the latter, and as of 2.9.5 it's the default for GNU/Linux too.
+
+
+
+ASSEMBLER COMMENTS
+
+Solaris "as" doesn't support "#" commenting, using /* */ instead,
+unfortunately. For that reason "C" commenting is used (see asm-defs.m4) and
+the intermediate ".s" files have no comments.
+
+
+
+ZERO DISPLACEMENTS
+
+In a couple of places addressing modes like 0(%ebx) with a byte-sized zero
+displacement are wanted, rather than (%ebx) with no displacement. These are
+either for computed jumps or to get desirable code alignment. Explicit
+.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into
+(%ebx). The Zdisp() macro in x86-defs.m4 is used for this.
+
+Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas
+1.92.3 changes it. Such a change is exactly the sort of "optimization"
+an assembler might perform, hence explicit ".byte"s are used where
+necessary.
+
+
+
+SHLD/SHRD INSTRUCTIONS
+
+The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx"
+must be written "shldl %eax,%ebx" for some assemblers. gas takes either;
+Solaris "as" doesn't allow %cl; gcc generates %cl for gas and NeXT (which is
+gas), and omits %cl elsewhere.
+
+For GMP an autoconf test is used to determine whether %cl should be used and
+the macros shldl, shrdl, shldw and shrdw in mpn/x86/x86-defs.m4 then pass
+through or omit %cl as necessary. See comments with those macros for usage.
+
+
+
+DIRECTION FLAG
+
+The x86 calling conventions say that the direction flag should be clear at
+function entry and exit. (See iBCS2 and SVR4 ABI books, references below.)
+
+Although this has been so since the year dot, it's not absolutely clear
+whether it's universally respected. Since it's better to be safe than
+sorry, gmp follows glibc and does a "cld" if it depends on the direction
+flag being clear. This happens only in a few places.
+
+
+
+POSITION INDEPENDENT CODE
+
+Defining the symbol PIC in m4 processing selects position independent code.
+This mainly affects computed jumps, and these are implemented in a
+self-contained fashion (without using the global offset table). The few
+calls from assembly code to global functions use the normal procedure
+linkage table.
+
+PIC is necessary for ELF shared libraries because they can be mapped into
+different processes at different virtual addresses. Text relocations in
+shared libraries are allowed, but that presumably means a page with such a
+relocation isn't shared. The use of the PLT for PIC adds a fixed cost to
+every function call, which is small but might be noticeable when working with
+small operands.
+
+Calls from one library function to another don't need to go through the PLT,
+since of course the call instruction uses a displacement, not an absolute
+address, and the relative locations of object files are known when libgmp.so
+is created. "ld -Bsymbolic" (or "gcc -Wl,-Bsymbolic") will resolve calls
+this way, so that there's no jump through the PLT, but of course leaving
+setups of the GOT address in %ebx that may be unnecessary.
+
+The %ebx setup could be avoided in assembly if a separate option controlled
+PIC for calls as opposed to computed jumps etc. But there's only ever
+likely to be a handful of calls out of assembler, and getting the same
+optimization for C intra-library calls would be more important. There seems
+no easy way to tell gcc that certain functions can be called non-PIC, and
+unfortunately many gmp functions use the global memory allocation variables,
+so they need the GOT anyway. Object files with no global data references
+and only intra-library calls could go into the library as non-PIC under
+-Bsymbolic. Integrating this into libtool and automake is left as an
+exercise for the reader.
+
+
+
+SIMPLE LOOPS
+
+The overheads in setting up for an unrolled loop can mean that at small
+sizes a simple loop is faster. Making small sizes go fast is important,
+even if it adds a cycle or two to bigger sizes. To this end various
+routines choose between a simple loop and an unrolled loop according to
+operand size. The path to the simple loop, or to special case code for
+small sizes, is always as fast as possible.
+
+Adding a simple loop requires a conditional jump to choose between the
+simple and unrolled code. The size of a branch misprediction penalty
+affects whether a simple loop is worthwhile.
+
+The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
+point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >=
+UNROLL_THRESHOLD using the unrolled loop. If position independent code adds
+a couple of cycles to an unrolled loop setup, the threshold will vary with
+PIC or non-PIC. Something like the following is typical.
+
+ ifdef(`PIC',`
+ deflit(UNROLL_THRESHOLD, 10)
+ ',`
+ deflit(UNROLL_THRESHOLD, 8)
+ ')
+
+There's no automated way to determine the threshold. Setting it to a small
+value and then to a big value makes it possible to measure the simple and
+unrolled loops each over a range of sizes, from which the crossover point
+can be determined. Alternatively, just adjust the threshold up or down
+until there are no further speedups.
+
+
+
+UNROLLED LOOP CODING
+
+The x86 addressing modes allow a byte displacement of -128 to +127, making
+it possible to access 256 bytes, which is 64 limbs, without adjusting
+pointer registers within the loop. Dword sized displacements can be used
+too, but they increase code size, and unrolling to 64 ought to be enough.
+
+When unrolling to the full 64 limbs/loop, the limb at the top of the loop
+will have a displacement of -128, so pointers have to have a corresponding
++128 added before entering the loop. When unrolling to 32 limbs/loop
+displacements 0 to 127 can be used with 0 at the top of the loop and no
+adjustment needed to the pointers.
+
+Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
+limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or
+16 is small, so support for 64 limbs/loop is generally only for comparison.
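+
+A quick C check of that displacement arithmetic (an illustration, not
+library code): with the source pointer biased by +128, limb i of a
+64 limb block sits at byte displacement 4*i - 128, which always fits
+in a signed byte.
+
+	#include <assert.h>
+
+	int
+	main (void)
+	{
+	  int i, disp;
+	  for (i = 0; i < 64; i++)
+	    {
+	      disp = 4*i - 128;	/* biased displacement of limb i */
+	      assert (-128 <= disp && disp <= 127);
+	    }
+	  return 0;
+	}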
+
+
+
+COMPUTED JUMPS
+
+When working from least significant limb to most significant limb (most
+routines) the computed jump and pointer calculations in preparation for an
+unrolled loop are as follows.
+
+ S = operand size in limbs
+ N = number of limbs per loop (UNROLL_COUNT)
+ L = log2 of unrolling (UNROLL_LOG2)
+ M = mask for unrolling (UNROLL_MASK)
+ C = code bytes per limb in the loop
+ B = bytes per limb (4 for x86)
+
+ computed jump (-S & M) * C + entrypoint
+ subtract from pointers (-S & M) * B
+ initial loop counter (S-1) >> L
+ displacements 0 to B*(N-1)
+
+The loop counter is decremented at the end of each loop, and the looping
+stops when the decrement takes the counter to -1. The displacements are for
+the addressing mode used to access each limb, eg. a load with
+"movl disp(%ebx), %eax".
+
+Usually the multiply by "C" can be handled without an imul, using instead an
+leal, or a shift and subtract.
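+
+A worked example of these formulas (the numbers are hypothetical, not
+taken from any particular routine): with S=23 limbs, N=16 limbs/loop
+(so L=4, M=15), C=6 code bytes per limb and B=4,
+
+	#include <stdio.h>
+
+	int
+	main (void)
+	{
+	  long S = 23, N = 16, L = 4, M = N - 1;
+	  long C = 6, B = 4;
+
+	  printf ("code offset    %ld\n", (-S & M) * C);	/* 54, added to entrypoint */
+	  printf ("pointer adjust %ld\n", (-S & M) * B);	/* 36 */
+	  printf ("loop counter   %ld\n", (S - 1) >> L);	/* 1 */
+	  return 0;
+	}
+
+The first pass through the loop then handles the 7 leftover limbs
+(23 mod 16), and the counter allows one further full pass of 16,
+giving 23 limbs in all.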
+
+When working from most significant to least significant limb (eg. mpn_lshift
+and mpn_copyd), the calculations change as follows.
+
+ add to pointers (-S & M) * B
+ displacements 0 to -B*(N-1)
+
+
+
+OLD GAS 1.92.3
+
+This version comes with FreeBSD 2.2.8 and has a couple of gremlins that
+affect gmp code.
+
+Firstly, an expression involving two forward references to labels comes out
+as zero. For example,
+
+ addl $bar-foo, %eax
+ foo:
+ nop
+ bar:
+
+This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax".
+When only one forward reference is involved, it works correctly, as for
+example,
+
+ foo:
+ addl $bar-foo, %eax
+ nop
+ bar:
+
+Secondly, an expression involving two labels can't be used as the
+displacement for an leal. For example,
+
+ foo:
+ nop
+ bar:
+ leal bar-foo(%eax,%ebx,8), %ecx
+
+A slightly cryptic error is given, "Unimplemented segment type 0 in
+parse_operand". When only one label is used it's ok, and the label can be a
+forward reference too, as for example,
+
+ leal foo(%eax,%ebx,8), %ecx
+ nop
+ foo:
+
+These problems only affect PIC computed jump calculations. The workarounds
+are just to do an leal without a displacement and then an addl, and to make
+sure the code is placed so that there's at most one forward reference in the
+addl.
+
+
+
+REFERENCES
+
+"Intel Architecture Software Developer's Manual", volumes 1 to 3, 1999,
+order numbers 243190, 243191 and 243192. Available on-line,
+
+ ftp://download.intel.com/design/PentiumII/manuals/243190.htm
+ ftp://download.intel.com/design/PentiumII/manuals/243191.htm
+ ftp://download.intel.com/design/PentiumII/manuals/243192.htm
+
+"Intel386 Family Binary Compatibility Specification 2", Intel Corporation,
+published by McGraw-Hill, 1991, ISBN 0-07-031219-2.
+
+"System V Application Binary Interface", Unix System Laboratories Inc, 1992,
+published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor
+Supplement", AT&T, 1991, ISBN 0-13-877689-X. (These have details of ELF
+shared library PIC coding.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/addsub_n.S b/rts/gmp/mpn/x86/addsub_n.S
new file mode 100644
index 0000000000..fe6f648f53
--- /dev/null
+++ b/rts/gmp/mpn/x86/addsub_n.S
@@ -0,0 +1,174 @@
+/* Currently not working and not used. */
+
+/*
+Copyright (C) 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+#define SAVE_BORROW_RESTORE_CARRY(r) adcl r,r; shll $31,r
+#define SAVE_CARRY_RESTORE_BORROW(r) adcl r,r
+
+ .globl mpn_addsub_n_0
+ .globl mpn_addsub_n_1
+
+/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s2,r2==s1.
+ We let subtraction and addition alternate in being two limbs
+ ahead of the other, thereby avoiding some SAVE_RESTORE. */
+// r1 = r2 + r1 edi = esi + edi
+// r2 = r2 - r1 esi = esi - edi
+// s1 s2
+// r2 r1
+// eax,ebx,ecx,edx,esi,edi,ebp
+mpn_addsub_n_0:
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+ movl 20(%esp),%edi /* res_ptr */
+ movl 24(%esp),%esi /* s1_ptr */
+ movl 36(%esp),%ebp /* size */
+
+ shrl $2,%ebp
+ xorl %edx,%edx
+ .align 4
+Loop0: // L=load E=execute S=store
+ movl (%esi),%ebx // sub 0 L
+ movl 4(%esi),%ecx // sub 1 L
+ sbbl (%edi),%ebx // sub 0 LE
+ sbbl 4(%edi),%ecx // sub 1 LE
+// SAVE_BORROW_RESTORE_CARRY(%edx)
+ movl (%esi),%eax // add 0 L
+ adcl %eax,(%edi) // add 0 LES
+ movl 4(%esi),%eax // add 1 L
+ adcl %eax,4(%edi) // add 1 LES
+ movl %ebx,(%esi) // sub 0 S
+ movl %ecx,4(%esi) // sub 1 S
+ movl 8(%esi),%ebx // add 2 L
+ adcl 8(%edi),%ebx // add 2 LE
+ movl 12(%esi),%ecx // add 3 L
+ adcl 12(%edi),%ecx // add 3 LE
+// SAVE_CARRY_RESTORE_BORROW(%edx)
+ movl 8(%edi),%eax // sub 2 L
+ sbbl %eax,8(%esi) // sub 2 LES
+ movl 12(%edi),%eax // sub 3 L
+ sbbl %eax,12(%esi) // sub 3 LES
+ movl %ebx,8(%edi) // add 2 S
+ movl %ecx,12(%edi) // add 3 S
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ebp
+ jnz Loop0
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s1,r2==s2.
+ We let subtraction and addition alternate in being two limbs
+ ahead of the other, thereby avoiding some SAVE_RESTORE. */
+// r1 = r1 + r2 edi = edi + esi
+// r2 = r1 - r2 esi = edi - esi
+// s2 s1
+// r2 r1
+// eax,ebx,ecx,edx,esi,edi,ebp
+mpn_addsub_n_1:
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+ movl 20(%esp),%edi /* res_ptr */
+ movl 24(%esp),%esi /* s1_ptr */
+ movl 36(%esp),%ebp /* size */
+
+ shrl $2,%ebp
+ xorl %edx,%edx
+ .align 4
+Loop1: // L=load E=execute S=store
+ movl (%edi),%ebx // sub 0 L
+ sbbl (%esi),%ebx // sub 0 LE
+ movl 4(%edi),%ecx // sub 1 L
+ sbbl 4(%esi),%ecx // sub 1 LE
+// SAVE_BORROW_RESTORE_CARRY(%edx)
+ movl (%esi),%eax // add 0 L
+ adcl %eax,(%edi) // add 0 LES
+ movl 4(%esi),%eax // add 1 L
+ adcl %eax,4(%edi) // add 1 LES
+ movl %ebx,(%esi) // sub 0 S
+ movl %ecx,4(%esi) // sub 1 S
+ movl 8(%esi),%ebx // add 2 L
+ adcl 8(%edi),%ebx // add 2 LE
+ movl 12(%esi),%ecx // add 3 L
+ adcl 12(%edi),%ecx // add 3 LE
+// SAVE_CARRY_RESTORE_BORROW(%edx)
+ movl 8(%edi),%eax // sub 2 L
+ sbbl 8(%esi),%eax // sub 2 LES
+ movl %eax,8(%esi) // sub 2 S
+ movl 12(%edi),%eax // sub 3 L
+ sbbl 12(%esi),%eax // sub 3 LE
+ movl %eax,12(%esi) // sub 3 S
+ movl %ebx,8(%edi) // add 2 S
+ movl %ecx,12(%edi) // add 3 S
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ebp
+ jnz Loop1
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+ .globl mpn_copy
+mpn_copy:
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+ movl 20(%esp),%edi /* res_ptr */
+ movl 24(%esp),%esi /* s1_ptr */
+ movl 28(%esp),%ebp /* size */
+
+ shrl $2,%ebp
+ .align 4
+Loop2:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl 8(%esi),%eax
+ movl 12(%esi),%ebx
+ movl %eax,8(%edi)
+ movl %ebx,12(%edi)
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ebp
+ jnz Loop2
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
diff --git a/rts/gmp/mpn/x86/aors_n.asm b/rts/gmp/mpn/x86/aors_n.asm
new file mode 100644
index 0000000000..18ef816b4d
--- /dev/null
+++ b/rts/gmp/mpn/x86/aors_n.asm
@@ -0,0 +1,187 @@
+dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
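+C The computed jump below enters the 8-way unrolled loop part-way
+C through, so the size%8 leftover limbs are handled on the first pass.
+C A hedged C sketch of the same idea for the add case (Duff's device;
+C the names and the 32-bit limb type are illustrative, not GMP API;
+C n must be >= 1, as for the asm):
+C
+C     typedef unsigned long mp_limb_t;
+C
+C     #define ADC(d, a, b)                 \
+C       do {                               \
+C         mp_limb_t s_ = (a) + cy;         \
+C         cy = s_ < cy;                    \
+C         s_ += (b);                       \
+C         cy += s_ < (b);                  \
+C         *(d) = s_;                       \
+C       } while (0)
+C
+C     mp_limb_t add_n_sketch (mp_limb_t *dp, const mp_limb_t *s1,
+C                             const mp_limb_t *s2, long n)
+C     {
+C       mp_limb_t cy = 0;
+C       long loops = (n + 7) >> 3;         /* ceil(n/8), as %ecx */
+C       switch (n & 7) {                   /* the "jmp *%eax" */
+C         case 0: do { ADC (dp++, *s1++, *s2++);
+C         case 7:      ADC (dp++, *s1++, *s2++);
+C         case 6:      ADC (dp++, *s1++, *s2++);
+C         case 5:      ADC (dp++, *s1++, *s2++);
+C         case 4:      ADC (dp++, *s1++, *s2++);
+C         case 3:      ADC (dp++, *s1++, *s2++);
+C         case 2:      ADC (dp++, *s1++, *s2++);
+C         case 1:      ADC (dp++, *s1++, *s2++);
+C                 } while (--loops > 0);
+C       }
+C       return cy;
+C     }
+C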
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(M4_function_nc)
+deflit(`FRAME',0)
+
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%edx
+ movl PARAM_SIZE,%ecx
+
+ movl %ecx,%eax
+ shrl $3,%ecx C compute count for unrolled loop
+ negl %eax
+ andl $7,%eax C get index where to start loop
+ jz LF(M4_function_n,oopgo) C necessary special case for 0
+ incl %ecx C adjust loop count
+ shll $2,%eax C adjustment for pointers...
+ subl %eax,%edi C ... since they are offset ...
+ subl %eax,%esi C ... by a constant when we ...
+ subl %eax,%edx C ... enter the loop
+ shrl $2,%eax C restore previous value
+
+ifdef(`PIC',`
+ C Calculate start address in loop for PIC. Due to limitations in
+ C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal
+ call L(0a)
+L(0a): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $LF(M4_function_n,oop)-L(0a)-3,%eax
+ addl $4,%esp
+',`
+ C Calculate start address in loop for non-PIC.
+ leal LF(M4_function_n,oop)-3(%eax,%eax,8),%eax
+')
+
+ C These lines initialize carry from the 5th parameter. Should be
+ C possible to simplify.
+ pushl %ebp FRAME_pushl()
+ movl PARAM_CARRY,%ebp
+ shrl $1,%ebp C shift bit 0 into carry
+ popl %ebp FRAME_popl()
+
+ jmp *%eax C jump into loop
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)
+deflit(`FRAME',0)
+
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%edx
+ movl PARAM_SIZE,%ecx
+
+ movl %ecx,%eax
+ shrl $3,%ecx C compute count for unrolled loop
+ negl %eax
+ andl $7,%eax C get index where to start loop
+ jz L(oop) C necessary special case for 0
+ incl %ecx C adjust loop count
+ shll $2,%eax C adjustment for pointers...
+ subl %eax,%edi C ... since they are offset ...
+ subl %eax,%esi C ... by a constant when we ...
+ subl %eax,%edx C ... enter the loop
+ shrl $2,%eax C restore previous value
+
+ifdef(`PIC',`
+ C Calculate start address in loop for PIC. Due to limitations in
+ C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
+ call L(0b)
+L(0b): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $L(oop)-L(0b)-3,%eax
+ addl $4,%esp
+',`
+ C Calculate start address in loop for non-PIC.
+ leal L(oop)-3(%eax,%eax,8),%eax
+')
+ jmp *%eax C jump into loop
+
+L(oopgo):
+ pushl %ebp FRAME_pushl()
+ movl PARAM_CARRY,%ebp
+ shrl $1,%ebp C shift bit 0 into carry
+ popl %ebp FRAME_popl()
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ M4_inst (%edx),%eax
+ movl %eax,(%edi)
+ movl 4(%esi),%eax
+ M4_inst 4(%edx),%eax
+ movl %eax,4(%edi)
+ movl 8(%esi),%eax
+ M4_inst 8(%edx),%eax
+ movl %eax,8(%edi)
+ movl 12(%esi),%eax
+ M4_inst 12(%edx),%eax
+ movl %eax,12(%edi)
+ movl 16(%esi),%eax
+ M4_inst 16(%edx),%eax
+ movl %eax,16(%edi)
+ movl 20(%esi),%eax
+ M4_inst 20(%edx),%eax
+ movl %eax,20(%edi)
+ movl 24(%esi),%eax
+ M4_inst 24(%edx),%eax
+ movl %eax,24(%edi)
+ movl 28(%esi),%eax
+ M4_inst 28(%edx),%eax
+ movl %eax,28(%edi)
+ leal 32(%edi),%edi
+ leal 32(%esi),%esi
+ leal 32(%edx),%edx
+ decl %ecx
+ jnz L(oop)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/aorsmul_1.asm b/rts/gmp/mpn/x86/aorsmul_1.asm
new file mode 100644
index 0000000000..f32ad83989
--- /dev/null
+++ b/rts/gmp/mpn/x86/aorsmul_1.asm
@@ -0,0 +1,134 @@
+dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
+dnl limb and add the result to a second limb vector.
+
+
+dnl Copyright (C) 1992, 1994, 1997, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1',`
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
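+C For reference, a hedged C sketch of the operation for the addmul case
+C (mpn/generic/addmul_1.c is the real C version; the helper name and the
+C 32-bit limb type here are illustrative).  The submul case subtracts lo
+C from the dst limb and counts borrows instead.
+C
+C     mp_limb_t addmul_1_sketch (mp_limb_t *dp, const mp_limb_t *sp,
+C                                long n, mp_limb_t m)
+C     {
+C       mp_limb_t cy = 0;                    /* carry limb ("cylimb") */
+C       for (long i = 0; i < n; i++)
+C         {
+C           unsigned long long p = (unsigned long long) sp[i] * m + cy;
+C           mp_limb_t lo = (mp_limb_t) p;    /* mull: eax */
+C           cy = (mp_limb_t) (p >> 32);      /* mull: edx */
+C           mp_limb_t d = dp[i] + lo;        /* M4_inst on the dst limb */
+C           cy += d < lo;
+C           dp[i] = d;
+C         }
+C       return cy;
+C     }
+C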
+define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
+define(PARAM_SIZE, `FRAME+12(%esp)')
+define(PARAM_SRC, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ M4_inst %eax,(%edi)
+ adcl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_SIZE,%ecx
+ shrl $2,%ecx
+ jz L(end)
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull PARAM_MULTIPLIER
+ M4_inst %ebx,(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull PARAM_MULTIPLIER
+ M4_inst %ebp,4(%edi)
+ adcl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull PARAM_MULTIPLIER
+ M4_inst %ebx,8(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ M4_inst %ebp,12(%edi)
+ adcl $0,%ebx C propagate carry into cylimb
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oop)
+
+L(end): movl %ebx,%eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyd.asm b/rts/gmp/mpn/x86/copyd.asm
new file mode 100644
index 0000000000..439640e836
--- /dev/null
+++ b/rts/gmp/mpn/x86/copyd.asm
@@ -0,0 +1,80 @@
+dnl x86 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl Future: On P6 an MMX loop should be able to go faster than this code.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from high to low addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P5 - 1.0 cycles/limb.
+C
+C P6 - 2.4 cycles/limb, approx 40 cycles startup.
+
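+C In C terms (a hedged sketch with illustrative names; the decrementing
+C order is what makes an overlapping move with dst > src safe):
+C
+C     void copyd_sketch (mp_limb_t *dp, const mp_limb_t *sp, long n)
+C     {
+C       for (long i = n - 1; i >= 0; i--)    /* high to low, as std/rep movsl */
+C         dp[i] = sp[i];
+C     }
+C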
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+ C eax saved esi
+ C ebx
+ C ecx counter
+ C edx saved edi
+ C esi src
+ C edi dst
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ movl PARAM_DST, %edi
+ leal -4(%esi,%ecx,4), %esi
+
+ leal -4(%edi,%ecx,4), %edi
+
+ std
+
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyi.asm b/rts/gmp/mpn/x86/copyi.asm
new file mode 100644
index 0000000000..5bc4e36689
--- /dev/null
+++ b/rts/gmp/mpn/x86/copyi.asm
@@ -0,0 +1,79 @@
+dnl x86 mpn_copyi -- copy limb vector, incrementing.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from low to high addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P5 - 1.0 cycles/limb.
+C
+C P6 - 0.75 cycles/limb. An MMX based copy was tried, but was found to be
+C slower than a rep movs in all cases. The fastest MMX found was 0.8
+C cycles/limb (when fully aligned). A rep movs seems to have a startup
+C time of about 15 cycles, but doing something special for small sizes
+C could lead to a branch misprediction that would destroy any saving.
+C For now a plain rep movs seems ok for P6.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+ C eax saved esi
+ C ebx
+ C ecx counter
+ C edx saved edi
+ C esi src
+ C edi dst
+ C ebp
+
+PROLOGUE(mpn_copyi)
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ movl PARAM_DST, %edi
+
+ cld C better safe than sorry, see mpn/x86/README.family
+
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/diveby3.asm b/rts/gmp/mpn/x86/diveby3.asm
new file mode 100644
index 0000000000..df879da9e1
--- /dev/null
+++ b/rts/gmp/mpn/x86/diveby3.asm
@@ -0,0 +1,115 @@
+dnl x86 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl The following all have their own optimized versions of this routine,
+dnl but for reference the code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P54 18.0
+dnl P55 17.0
+dnl P6 14.5
+dnl K6 14.0
+dnl K7 10.0
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3) and ceil(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
+
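+C The loop below divides exactly by multiplying by the inverse of 3
+C modulo 2^32 and deriving the next carry from two compares.  A hedged C
+C model (mpn/generic/diveby3.c is the real C version; the name and the
+C 32-bit limb type here are illustrative):
+C
+C     mp_limb_t divexact_by3c_sketch (mp_limb_t *dp, const mp_limb_t *sp,
+C                                     long n, mp_limb_t c)
+C     {
+C       for (long i = 0; i < n; i++)
+C         {
+C           mp_limb_t l = sp[i] - c;            /* propagate carry 0..3 */
+C           c = sp[i] < c;                      /* borrow (setc) */
+C           mp_limb_t q = l * 0xAAAAAAABUL;     /* l * 3^-1 mod 2^32 */
+C           dp[i] = q;
+C           c += q >= 0x55555556UL;             /* q >= ceil(2^32/3) */
+C           c += q >= 0xAAAAAAABUL;             /* q >= ceil(2*2^32/3) */
+C         }
+C       return c;
+C     }
+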
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %ecx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SIZE, %ebp
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ pushl %esi FRAME_pushl()
+
+ movl $INVERSE_3, %esi
+ pushl %ebx FRAME_pushl()
+
+ leal (%ecx,%ebp,4), %ecx
+ movl PARAM_CARRY, %ebx
+
+ leal (%edi,%ebp,4), %edi
+ negl %ebp
+
+
+ ALIGN(8)
+L(top):
+ C eax scratch, low product
+ C ebx carry limb (0 to 3)
+ C ecx &src[size]
+ C edx scratch, high product
+ C esi multiplier
+ C edi &dst[size]
+ C ebp counter, limbs, negative
+
+ movl (%ecx,%ebp,4), %eax
+
+ subl %ebx, %eax
+
+ setc %bl
+
+ imull %esi
+
+ cmpl $ONE_THIRD_CEIL, %eax
+ movl %eax, (%edi,%ebp,4)
+
+ sbbl $-1, %ebx C +1 if eax>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %eax
+
+ sbbl $-1, %ebx C +1 if eax>=ceil(b*2/3)
+ incl %ebp
+
+ jnz L(top)
+
+
+ movl %ebx, %eax
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ebp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/divrem_1.asm b/rts/gmp/mpn/x86/divrem_1.asm
new file mode 100644
index 0000000000..12f14676d6
--- /dev/null
+++ b/rts/gmp/mpn/x86/divrem_1.asm
@@ -0,0 +1,232 @@
+dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl cycles/limb
+dnl K6 20
+dnl P5 44
+dnl P6 39
+dnl 486 approx 43 maybe
+dnl
+dnl
+dnl The following have their own optimized divrem_1 implementations, but
+dnl for reference the code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P6MMX 39
+dnl K7 42
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C
+C Divide src,size by divisor and store the quotient in dst+xsize,size.
+C Extend the division to fractional quotient limbs in dst,xsize. Return the
+C remainder. Either or both xsize and size can be 0.
+C
+C mpn_divrem_1c takes a carry parameter which is an initial high limb,
+C effectively one extra limb at the top of src,size. Must have
+C carry<divisor.
+C
+C
+C Essentially the code is the same as the division based part of
+C mpn/generic/divrem_1.c, but has the following advantages.
+C
+C - If gcc isn't being used then divrem_1.c will get the generic C
+C udiv_qrnnd() and be rather slow.
+C
+C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
+C generate that instruction (as of gcc 2.95.2 at least).
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done. A div is between 20 and 40 cycles on the various
+C x86s, so assuming high<divisor about half the time, this test saves
+C half that amount. The branch misprediction penalty on each chip is less
+C than half a div.
+C
+C
+C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
+C here, so it seems there's nothing to gain by rearranging the loop.
+C Pairing the mov and loop instructions was found to gain nothing. (The
+C same is true of the mpn/x86/mod_1.asm loop.)
+C
+C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
+C The loop_or_decljnz macro is an easy way to get a 10% speedup.
+C
+C The fast K6 multiply might be thought to suit a multiply-by-inverse,
+C but that algorithm has been found to suffer from the relatively poor
+C carry handling on K6 and too many auxiliary instructions. The
+C fractional part however could be done at about 13 c/l.
+C
+C P5: Moving the load down to pair with the store might save 1 cycle, but
+C that doesn't seem worth bothering with, since it'd be only a 2.2%
+C saving.
+C
+C Again here the auxiliary instructions hinder a multiply-by-inverse,
+C though there might be a 10-15% speedup available
+
+
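+C For reference, a hedged C model of the division itself (illustrative
+C names and 32-bit limbs assumed; mpn/generic/divrem_1.c is the real C
+C version).  It omits the high<divisor short-cut described above;
+C mpn_divrem_1c would just seed r with its carry argument instead of 0.
+C
+C     mp_limb_t divrem_1_sketch (mp_limb_t *dp, long xsize,
+C                                const mp_limb_t *sp, long n, mp_limb_t d)
+C     {
+C       mp_limb_t r = 0;                       /* running remainder, < d */
+C       for (long i = n - 1; i >= 0; i--)      /* integer part */
+C         {
+C           unsigned long long t = ((unsigned long long) r << 32) | sp[i];
+C           dp[xsize + i] = (mp_limb_t) (t / d);    /* the divl */
+C           r = (mp_limb_t) (t % d);
+C         }
+C       for (long i = xsize - 1; i >= 0; i--)  /* fractional part */
+C         {
+C           unsigned long long t = (unsigned long long) r << 32;
+C           dp[i] = (mp_limb_t) (t / d);
+C           r = (mp_limb_t) (t % d);
+C         }
+C       return r;
+C     }
+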
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ orl %ecx, %ecx
+
+ movl PARAM_CARRY, %edx
+ jz LF(mpn_divrem_1,fraction)
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jmp LF(mpn_divrem_1,integer_top)
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx,%ecx
+
+ jz L(size_zero)
+ pushl %ebx FRAME_pushl()
+
+ movl -4(%edi,%ecx,4), %eax C src high limb
+ xorl %edx, %edx
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ cmpl %esi, %eax
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jae L(integer_entry)
+
+
+ C high<divisor, so high of dst is zero, and avoid one div
+
+ movl %edx, (%ebx,%ecx,4)
+ decl %ecx
+
+ movl %eax, %edx
+ jz L(fraction)
+
+
+L(integer_top):
+ C eax scratch (quotient)
+ C ebx dst+4*xsize-4
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi src
+ C ebp xsize
+
+ movl -4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+ divl %esi
+
+ movl %eax, (%ebx,%ecx,4)
+ loop_or_decljnz L(integer_top)
+
+
+L(fraction):
+ orl %ebp, %ecx
+ jz L(done)
+
+ movl PARAM_DST, %ebx
+
+
+L(fraction_top):
+ C eax scratch (quotient)
+ C ebx dst
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ xorl %eax, %eax
+
+ divl %esi
+
+ movl %eax, -4(%ebx,%ecx,4)
+ loop_or_decljnz L(fraction_top)
+
+
+L(done):
+ popl %ebp
+ movl %edx, %eax
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+ movl PARAM_XSIZE, %ecx
+ xorl %eax, %eax
+
+ movl PARAM_DST, %edi
+
+ cld C better safe than sorry, see mpn/x86/README.family
+
+ rep
+ stosl
+
+ popl %esi
+ popl %edi
+ ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/README b/rts/gmp/mpn/x86/k6/README
new file mode 100644
index 0000000000..3ad96c8b89
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/README
@@ -0,0 +1,237 @@
+
+ AMD K6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
+K6-3.
+
+The mmx and k62mmx subdirectories have routines using MMX instructions. All
+K6s have MMX, the separate directories are just so that ./configure can omit
+them if the assembler doesn't support MMX.
+
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.25 normal, 2.75 in-place
+
+ mpn_mul_1 6.25
+ mpn_add/submul_1 7.65-8.4 (varying with data values)
+
+ mpn_mul_basecase 9.25 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.7 cycles/crossproduct (approx)
+ or 9.2 cycles/triangleproduct (approx)
+
+ mpn_divrem_1 20.0
+ mpn_mod_1 20.0
+ mpn_divexact_by3 11.0
+
+ mpn_l/rshift 3.0
+
+ mpn_copyi/copyd 1.0
+
+ mpn_com_n 1.5-1.85 \
+ mpn_and/andn/ior/xor_n 1.5-1.75 | varying with
+ mpn_iorn/xnor_n 2.0-2.25 | data alignment
+ mpn_nand/nior_n 2.0-2.25 /
+
+ mpn_popcount 12.5
+ mpn_hamdist 13.0
+
+
+K6-2 and K6-3 have dual-issue MMX and get the following improvements.
+
+ mpn_l/rshift 1.75
+
+ mpn_copyi/copyd 0.56 or 1.0 \
+ |
+ mpn_com_n 1.0-1.2 | varying with
+ mpn_and/andn/ior/xor_n 1.2-1.5 | data alignment
+ mpn_iorn/xnor_n 1.5-2.0 |
+ mpn_nand/nior_n 1.75-2.0 /
+
+ mpn_popcount 9.0
+ mpn_hamdist 11.5
+
+
+Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch"
+instruction, code seems to run slower, and with just "mov" loads it doesn't
+seem faster. Results so far are inconsistent. The K6 does a hardware
+prefetch of the second cache line in a sector, so the penalty for not
+prefetching in software is reduced.
+
+
+
+
+NOTES
+
+All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
+
+Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
+execute them in both X and Y (and together).
+
+Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
+chapter 6 table 12).
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+Store queue is 7 entries of 64 bits each.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines, up to 64 for some.
+
+Sometimes computed jumps into the unrolling are used to handle sizes not a
+multiple of the unrolling. An attractive feature of this is that times
+increase smoothly with operand size, but an indirect jump is about 6 cycles
+and the setups about another 6, so whether a computed jump ought to be used
+depends on how much faster the unrolled code is than a simple loop.
+
+Position independent code is implemented using a call to get eip for
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory. Such a call however still costs 4 to 7
+cycles.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+MMX
+
+Putting emms or femms as late as possible in a routine seems to be fastest.
+Perhaps an emms or femms stalls until all outstanding MMX instructions have
+completed, so putting it later gives them a chance to complete on their own,
+in parallel with other operations (like register popping).
+
+The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
+at the start of a routine, in case it's been preceded by x87 floating point
+operations. This isn't done because in gmp programs it's expected that x87
+floating point won't be much used and that chances are an mpn routine won't
+have been preceded by any x87 code.
+
+
+
+CODING
+
+Instructions in general code are shown paired if they can decode and execute
+together, meaning two short decode instructions with the second not
+depending on the first, only the first using the shifter, no more than one
+load, and no more than one store.
+
+K6 does some out of order execution so the pairings aren't essential, they
+just show what slots might be available. When decoding is the limiting
+factor things can be scheduled that might not execute until later.
+
+
+
+NOTES
+
+Code alignment
+
+- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
+ short decode is inhibited. The cross.pl script detects this.
+
+- loops and branch targets should be aligned to 16 bytes, or ensure at least
+ 2 instructions before a 32 byte boundary. This makes use of the 16 byte
+ cache in the BTB.
+
+Addressing modes
+
+- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this
+ problem, and can be used as an equivalent, or easier is just to use a
+ different register, like %ebx.
+
+- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3
+ have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
+
+ If more than 3 bytes are needed to determine instruction length then
+ decoding degrades from direct to long, or from long to vector. This
+ happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
+ with mod=00 the sib determines whether there's a displacement.
+
+ This affects all MMX and 3DNow instructions, and others with an 0F prefix
+ like movzbl. The modes affected are anything with an index and no
+ displacement, or an index but no base, and this includes (%esp) which is
+ really (,%esp,1).
+
+ The cross.pl script detects problem cases. The workaround is to always
+ use a displacement, and to do this with Zdisp if it's zero so the
+ assembler doesn't discard it.
+
+ See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
+ 13-14 and 36-37.
+
+Calls
+
+- indirect jumps and calls are not branch predicted, they measure about 6
+ cycles.
+
+Various
+
+- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe
+- bsf 12-27 cycles
+- emms 5 cycles
+- femms 3 cycles
+- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken)
+- divl 20 cycles back-to-back
+- imull 2 decode, 2 execute
+- mull 2 decode, 3 execute (optimization manual decoding sample)
+- prefetch 2 cycles
+- rcll/rcrl implicit by one bit: 2 cycles
+ immediate or %cl count: 11 + 2 per bit for dword
+ 13 + 4 per bit for byte
+- setCC 2 cycles
+- xchgl %eax,reg 1.5 cycles, back-to-back (strange)
+ reg,reg 2 cycles, back-to-back
+
+
+
+
+REFERENCES
+
+"AMD-K6 Processor Code Optimization Application Note", AMD publication
+number 21924, revision D amendment 0, January 2000. This describes K6-2 and
+K6-3. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21924.pdf
+
+"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
+publication number 21828, revision A amendment 0, August 1997. This is an
+older edition of the above document, describing plain K6. Available
+on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21828.pdf
+
+"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
+This describes the femms and prefetch instructions, but nothing else from
+3DNow has been used. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21928.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general K6 optimizations as well as
+3DNow. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/k6/aors_n.asm b/rts/gmp/mpn/x86/k6/aors_n.asm
new file mode 100644
index 0000000000..31b05ada51
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aors_n.asm
@@ -0,0 +1,329 @@
+dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
+dnl
+dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result
+C (1 or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
+C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
+C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
+
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_SIZE, `FRAME+16(%esp)')
+define(PARAM_SRC2, `FRAME+12(%esp)')
+define(PARAM_SRC1, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+deflit(`FRAME',0)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp LF(M4_function_n,start)
+EPILOGUE()
+
+
+PROLOGUE(M4_function_n)
+ xorl %eax, %eax
+L(start):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC1, %ebx
+ pushl %edi
+FRAME_pushl()
+
+ movl PARAM_SRC2, %edx
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ jae L(unroll)
+
+
+ shrl %eax C initial carry flag
+
+ C offset 0x21 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+ C
+ C The store to (%edi) could be done with a stosl; it'd be smaller
+ C code, but there's no speed gain and a cld would have to be added
+ C (per mpn/x86/README.family).
+
+ movl (%ebx), %eax
+ leal 4(%ebx), %ebx
+
+ M4_inst (%edx), %eax
+
+ movl %eax, (%edi)
+ leal 4(%edi), %edi
+
+ leal 4(%edx), %edx
+ loop L(simple)
+
+
+ movl $0, %eax
+ popl %edi
+
+ setc %al
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(unroll):
+ C eax carry
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ cmpl %edi, %ebx
+ pushl %esi
+
+ je L(inplace)
+
+ifdef(`OPERATION_add_n',`
+ cmpl %edi, %edx
+
+ je L(inplace_reverse)
+')
+
+ movl %ecx, %esi
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+ shrl %eax
+
+ ALIGN(32)
+L(normal_top):
+ C eax counter, qwords, negative
+ C ebx src1
+ C ecx scratch
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+ M4_inst -20(%edx,%ecx,4), %eax
+ movl %eax, -20(%edi,%ecx,4)
+
+ movl 4-20(%ebx,%ecx,4), %eax
+ M4_inst 4-20(%edx,%ecx,4), %eax
+ movl %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%ebx,%ecx,4), %eax
+ M4_inst 8-20(%edx,%ecx,4), %eax
+ movl %eax, 8-20(%edi,%ecx,4)
+
+ movl 12-20(%ebx,%ecx,4), %eax
+ M4_inst 12-20(%edx,%ecx,4), %eax
+ movl %eax, 12-20(%edi,%ecx,4)
+
+ loop L(normal_top)
+
+
+ decl %esi
+ jz L(normal_finish_one)
+ js L(normal_done)
+
+ C two or three more limbs
+
+ movl (%ebx), %eax
+ M4_inst (%edx), %eax
+ movl %eax, (%edi)
+
+ movl 4(%ebx), %eax
+ M4_inst 4(%edx), %eax
+ decl %esi
+ movl %eax, 4(%edi)
+
+ jz L(normal_done)
+ movl $2, %ecx
+
+L(normal_finish_one):
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+
+L(normal_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+ifdef(`OPERATION_add_n',`
+L(inplace_reverse):
+ C dst==src2
+
+ movl %ebx, %edx
+')
+
+L(inplace):
+ C eax initial carry
+ C ebx
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ leal -1(%ecx), %esi
+ decl %ecx
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ movl (%edx), %ebx C src low limb
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ shrl %eax
+
+
+ ALIGN(32)
+L(inplace_top):
+ C eax
+ C ebx next src limb
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ M4_inst %ebx, (%edi,%ecx,4)
+
+ movl 4(%edx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+
+ M4_inst %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%edx,%ecx,4), %eax
+ movl 12-20(%edx,%ecx,4), %ebx
+
+ M4_inst %eax, 8-20(%edi,%ecx,4)
+ M4_inst %ebx, 12-20(%edi,%ecx,4)
+
+ movl 16-20(%edx,%ecx,4), %ebx
+ loop L(inplace_top)
+
+
+ C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
+
+ M4_inst %ebx, (%edi)
+
+ decl %esi
+ jz L(inplace_finish_one)
+ js L(inplace_done)
+
+ C two or three more limbs
+
+ movl 4(%edx), %eax
+ movl 8(%edx), %ebx
+ M4_inst %eax, 4(%edi)
+ M4_inst %ebx, 8(%edi)
+
+ decl %esi
+ movl $2, %ecx
+
+ jz L(normal_done)
+
+L(inplace_finish_one):
+ movl 4(%edx,%ecx,4), %eax
+ M4_inst %eax, 4(%edi,%ecx,4)
+
+L(inplace_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
new file mode 100644
index 0000000000..da4120fe2f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
@@ -0,0 +1,372 @@
+dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
+dnl PIC adds about 6 cycles at the start.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl                    K6: large multipliers  small multipliers
+dnl    UNROLL_COUNT        cycles/limb        cycles/limb
+dnl          4                 9.5                7.78
+dnl          8                 9.0                7.78
+dnl         16                 8.4                7.65
+dnl         32                 8.4                8.2
+dnl
+dnl Maximum possible unrolling with the current code is 32.
+dnl
+dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
+dnl byte block, which might explain the good speed at that unrolling.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
+C multipliers (most significant few bits clear) result in few carry bits and
+C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
+C significant few bits set) make the carry bits 50/50 and lead to something
+C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
+C
+C It's important that the gains for jadcl0 on small multipliers don't come
+C at the cost of slowing down other data. Tests on uniformly distributed
+C random data, designed to confound branch prediction, show about a 7%
+C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
+C overheads included).
+C
+C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
+C 11.0 cycles/limb), and hence isn't used.
+C
+C In the simple loop, note that running ecx from negative to zero and using
+C it as an index in the two movs wouldn't help. It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C decode cycles
+C mull %ebp 2
+C M4_inst %esi, disp(%edi) 1
+C adcl %eax, %ecx 2
+C movl %edx, %esi \ 1
+C jnc 1f /
+C incl %esi \ 1
+C 1: movl disp(%ebx), %eax /
+C ---
+C 7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted). This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Watch this space for more info ...
+C
+C It's not clear how much branch misprediction might be costing. The K6
+C doco says it will be 1 to 4 cycles, but presumably it's near the low end
+C of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C         1    b-1 b-1  x*y     1    b*(b-1)   b*(b-1)
+C        --- * sum sum  --- =  --- * ------- * ------- = 1/4
+C        b^2   x=0 y=1  b^2    b^4      2         2
+C
+C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
+
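+C That figure is easy to check numerically (a hedged standalone snippet,
+C not GMP code): the double sum collapses to ((b-1)/(2b))^2.
+C
+C     #include <stdio.h>
+C     int main (void)
+C     {
+C       double b = 4294967296.0;           /* 2^32 */
+C       double p = (b - 1.0) / (2.0 * b);  /* mean of x/b over 0..b-1 */
+C       printf ("%.12f\n", p * p);         /* ~0.25, a shade under */
+C       return 0;
+C     }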
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(M4_function_1,start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %ebx
+ pushl %edi
+deflit(`FRAME',12)
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_DST, %edi
+
+ pushl %ebp
+deflit(`FRAME',16)
+ jae L(unroll)
+
+
+ C simple loop
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl $4, %edi
+ addl %esi, %eax
+
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, %esi
+ loop L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %ebx
+ movl %esi, %eax
+
+ popl %esi
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
+C
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers at the point of doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %esi is necessary only for the
+C mpn_addmul/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER, `PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+L(unroll):
+ C eax
+ C ebx src
+ C ecx size
+ C edx
+ C esi initial carry
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ shll $4, %edx
+ negl %ecx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%ebx), %eax C src low limb
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %esi, %eax C initial carry (from _1c)
+ jadcl0( %edx)
+
+
+ leal 4(%ebx,%ecx,4), %ebx
+ movl %edx, %esi C high carry
+
+ movl VAR_JUMP, %edx
+ leal (%edi,%ecx,4), %edi
+
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ jz L(noswap)
+ movl %esi, %ecx C high,low carry other way around
+
+ movl %eax, %esi
+L(noswap):
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx src
+ C ecx carry lo
+ C edx scratch
+ C esi carry hi
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes per limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(entry):
+forloop(`i', 0, UNROLL_COUNT/2-1, `
+ deflit(`disp0', eval(2*i*4))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+ M4_inst %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(top)
+
+
+ popl %ebp
+ M4_inst %ecx, UNROLL_BYTES(%edi)
+
+ popl %edi
+ movl %esi, %eax
+
+ popl %ebx
+ jadcl0( %eax)
+
+ popl %esi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/cross.pl b/rts/gmp/mpn/x86/k6/cross.pl
new file mode 100644
index 0000000000..21734f3e52
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/cross.pl
@@ -0,0 +1,141 @@
+#! /usr/bin/perl
+
+# Copyright (C) 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# As far as fixing problems goes, any cache line crossing problems in loops
+# get attention, but as a rule it's too tedious to rearrange code or slip in
+# nops to fix every problem in setup or finishup code.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions. All that occur in practice in GMP are present
+# though.
+#
+# There's no messages for using the vector decoded addressing mode (%esi),
+# but that mode is easy to avoid when coding.
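+#
+# For example, from the mpn object directory of a k6 build (the file
+# names here are illustrative),
+#
+#     perl cross.pl aors_n.o aorsmul_1.o | grep ZZ
+#
+# lists just the problem annotations.  objdump must be available, since
+# the script pipes the disassembly from it.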
+
+use strict;
+
+sub disassemble {
+ my ($file) = @_;
+ my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+
+ open (IN, "objdump -Srfh $file |")
+ || die "Cannot open pipe from objdump\n";
+ while (<IN>) {
+ print;
+
+ if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+ if ($1 < 5) {
+ print "ZZ need at least 2**5 for predictable cache line crossing\n";
+ }
+ }
+
+ if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+ } else {
+ next;
+ }
+
+ if ($b1 =~ /0f/) {
+ $prefix = $b1;
+ $opcode = $b2;
+ $modrm = $b3;
+ } else {
+ $prefix = '';
+ $opcode = $b1;
+ $modrm = $b2;
+ }
+
+ # modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+ # for K6 and pre-CXT K6-2
+ if ($prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm =~ /^[0-3][4c]/) {
+ print "ZZ ($file) >3 bytes to determine instruction length\n";
+ }
+
+ # with just an opcode, starting 1f mod 20h
+ if ($addr =~ /[13579bdf]f$/
+ && $prefix !~ /0f/
+ && $opcode !~ /1[012345]/ # adc
+ && $opcode !~ /1[89abcd]/ # sbb
+ && $opcode !~ /68/ # push $imm32
+ && $opcode !~ /^7/ # jcond disp8
+ && $opcode !~ /a[89]/ # test+imm
+ && $opcode !~ /a[a-f]/ # stos/lods/scas
+ && $opcode !~ /b8/ # movl $imm32,%eax
+ && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz
+ && $opcode !~ /e[b9]/ # jmp disp8/disp32
+ && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
+ && !($opcode =~ /f[67]/ # grp 1
+ && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything starting at 1f mod 20h
+ if ($addr =~ /[13579bdf][f]$/
+ && $prefix =~ /0f/) {
+ print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything with mod/rm starting at 1e mod 20h
+ if ($addr =~ /[13579bdf][e]$/
+ && $prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
+ }
+ }
+ close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+
+my @files;
+if ($#ARGV >= 0) {
+ @files = @ARGV;
+} else {
+ @files = glob "*.asm";
+map {s/\.asm$/.o/} @files;
+}
+
+foreach (@files) {
+ disassemble($_);
+}
diff --git a/rts/gmp/mpn/x86/k6/diveby3.asm b/rts/gmp/mpn/x86/k6/diveby3.asm
new file mode 100644
index 0000000000..ffb97bc380
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/diveby3.asm
@@ -0,0 +1,110 @@
+dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl K6: 11.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+C
+C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't
+C lead to vector decoding, unlike plain (%esi) does.
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
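+C A hedged C model of the carry recovery used here (names illustrative,
+C 32-bit limbs assumed): unlike mpn/x86/diveby3.asm, which compares the
+C quotient against ceil(2^32/3) and ceil(2*2^32/3), this version
+C multiplies the quotient back by 3 and takes the high limb, which is
+C the same 0..2 value.
+C
+C     mp_limb_t divexact_by3c_k6_sketch (mp_limb_t *dp, const mp_limb_t *sp,
+C                                        long n, mp_limb_t c)
+C     {
+C       for (long i = 0; i < n; i++)
+C         {
+C           mp_limb_t l = sp[i] - c;              /* carry is 0 to 3 */
+C           c = sp[i] < c;                        /* setc %bl */
+C           mp_limb_t q = l * 0xAAAAAAABUL;       /* imull $INVERSE_3 */
+C           dp[i] = q;
+C           unsigned long long t = (unsigned long long) q * 3;
+C           c += (mp_limb_t) (t >> 32);           /* mull VAR_THREE; addl %edx */
+C         }
+C       return c;
+C     }
+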
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl PARAM_SRC, %esi
+ pushl %edi defframe_pushl(SAVE_EDI)
+
+ movl PARAM_DST, %edi
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ movl PARAM_CARRY, %ebx
+ leal (%esi,%ecx,4), %esi
+
+ pushl $3 defframe_pushl(VAR_THREE)
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+
+
+	C Need 32-byte alignment for the claimed speed, to avoid the movl
+	C store opcode/modrm crossing a cache line boundary
+
+ ALIGN(32)
+L(top):
+ C eax scratch, low product
+ C ebx carry limb (0 to 3)
+ C ecx counter, limbs, negative
+ C edx scratch, high product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+ C
+ C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax"
+ C doesn't cross a 32 byte boundary, saving a couple of cycles
+ C (that's a fixed couple, not per loop).
+
+Zdisp( movl, 0,(%esi,%ecx,4), %eax)
+ subl %ebx, %eax
+
+ setc %bl
+
+ imull $INVERSE_3, %eax
+
+ movl %eax, (%edi,%ecx,4)
+ addl $2, %ecx
+
+ mull VAR_THREE
+
+ addl %edx, %ebx
+ loop L(top)
+
+
+ movl SAVE_ESI, %esi
+ movl %ebx, %eax
+
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EDI, %edi
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/gmp-mparam.h b/rts/gmp/mpn/x86/k6/gmp-mparam.h
new file mode 100644
index 0000000000..77f3948d77
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 3 /* cycles */
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME 20 /* cycles */
+#endif
+
+/* bsfl takes 12-27 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-04. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 18
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 130
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 34
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 68
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 67
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 472
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 4352
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 544
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
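+
+/* Illustrative sketch (not part of the original header): thresholds like
+   these gate the algorithm choice in the multiply routines, roughly
+
+	if (size < KARATSUBA_MUL_THRESHOLD)
+	  ... basecase O(n^2) multiply ...
+	else if (size < TOOM3_MUL_THRESHOLD)
+	  ... Karatsuba ...
+	else
+	  ... toom3, then FFT beyond FFT_MUL_THRESHOLD ...
+
+   and tuneup.c measures where each crossover falls on the target CPU.  */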
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000000..20a33e6ccf
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,179 @@
+dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64; the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, processing limbs from high to low addresses.
+C
+C The comments in copyi.asm apply here too.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ std
+
+ movl PARAM_DST, %edi
+ cmpl $UNROLL_COUNT, %ecx
+
+ leal -4(%esi,%ecx,4), %esi
+
+ leal -4(%edi,%ecx,4), %edi
+ ja L(unroll)
+
+L(simple):
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+
+L(unroll):
+ C if src and dst are different alignments mod8, then use rep movs
+ C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+ pushl %ebx
+ leal (%esi,%edi), %ebx
+
+ testb $4, %bl
+ popl %ebx
+
+ jnz L(simple)
+ testl $4, %esi
+
+ leal -UNROLL_COUNT(%ecx), %ecx
+ jnz L(already_aligned)
+
+ movsl
+
+ decl %ecx
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+ subl $128, %esi
+ subl $128, %edi
+')
+
+ C offset 0x3D here, but gets full speed without further alignment
+L(top):
+ C eax saved esi
+ C ebx
+ C ecx counter, limbs
+ C edx saved edi
+ C esi src, incrementing
+ C edi dst, incrementing
+ C ebp
+ C
+ C `disp' is never 0, so don't need to force 0(%esi).
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+
+ leal -UNROLL_BYTES(%esi), %esi
+ subl $UNROLL_COUNT, %ecx
+
+ leal -UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+	C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+ C UNROLL_COUNT-1 limbs remaining
+
+ testb $eval(UNROLL_COUNT/2), %cl
+
+ leal UNROLL_COUNT(%ecx), %ecx
+ jz L(not_half)
+
+
+ C at an unroll count of 32 this block of code is 16 cycles faster than
+ C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+ deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+
+ subl $eval(UNROLL_BYTES/2), %esi
+ subl $eval(UNROLL_BYTES/2), %edi
+
+ subl $eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+ addl $128, %esi
+ addl $128, %edi
+')
+
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ femms
+ ret
+
+EPILOGUE()
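+
+C A C sketch of the alignment test in L(unroll) above: two 4-byte aligned
+C pointers have the same alignment mod 8 exactly when the sum of their
+C addresses has bit 2 clear (sketch only, not GMP code).
+C
+C	if (((unsigned long) src + (unsigned long) dst) & 4)
+C	  goto simple;    /* different alignments mod 8: rep movs */
+C	/* else both the same mod 8; if that common alignment is 4mod8,
+C	   one movsl first makes the movq loop run 8-byte aligned */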
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
new file mode 100644
index 0000000000..215d805f2e
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
@@ -0,0 +1,196 @@
+dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64; the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
+C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
+C used instead.
+C
+C mod8
+C src dst
+C 0 0 both aligned, use mmx
+C 0 4 unaligned, use rep movs
+C 4 0 unaligned, use rep movs
+C 4 4 do one movs, then both aligned, use mmx
+C
+C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
+C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
+C
+C A pattern of two movq loads and two movq stores (or four and four) was
+C tried, but found to be the same speed as just one of each.
+C
+C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx
+C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
+C movs.
+C
+C Enhancement:
+C
+C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd
+C make it possible to avoid incrementing %esi and %edi in the loop and hence
+C get loop overhead down to 1 cycle. Care would be needed to avoid bad
+C cache line crossings since the "movq"s would then be 5 code bytes rather
+C than 4.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyi)
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ cld
+
+ movl PARAM_DST, %edi
+ cmpl $UNROLL_COUNT, %ecx
+
+ ja L(unroll)
+
+L(simple):
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+
+L(unroll):
+ C if src and dst are different alignments mod8, then use rep movs
+ C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+ pushl %ebx
+ leal (%esi,%edi), %ebx
+
+ testb $4, %bl
+ popl %ebx
+
+ jnz L(simple)
+ testl $4, %esi
+
+ leal -UNROLL_COUNT(%ecx), %ecx
+ jz L(already_aligned)
+
+ decl %ecx
+
+ movsl
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+ addl $128, %esi
+ addl $128, %edi
+')
+
+ C this is offset 0x34, no alignment needed
+L(top):
+ C eax saved esi
+ C ebx
+ C ecx counter, limbs
+ C edx saved edi
+ C esi src, incrementing
+ C edi dst, incrementing
+ C ebp
+ C
+ C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
+ C 0(%edi) keeps code aligned to 16 byte boundaries.
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+Zdisp( movq, disp,(%esi), %mm0)
+Zdisp( movq, %mm0, disp,(%edi))
+')
+
+ addl $UNROLL_BYTES, %esi
+ subl $UNROLL_COUNT, %ecx
+
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+	C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+ C UNROLL_COUNT-1 limbs remaining
+
+ testb $eval(UNROLL_COUNT/2), %cl
+
+ leal UNROLL_COUNT(%ecx), %ecx
+ jz L(not_half)
+
+ C at an unroll count of 32 this block of code is 16 cycles faster than
+ C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+ deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+ addl $eval(UNROLL_BYTES/2), %esi
+ addl $eval(UNROLL_BYTES/2), %edi
+
+ subl $eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+ subl $128, %esi
+ subl $128, %edi
+')
+
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000000..f6d54f97a8
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,286 @@
+dnl AMD K6-2 mpn_lshift -- mpn left shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl Minimum 9, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 9)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ movl %edx, VAR_RETVAL
+ jae L(unroll)
+
+
+ movd %ecx, %mm7
+ movl %eax, %ecx
+
+ movl PARAM_DST, %eax
+
+L(simple):
+ C eax dst
+ C ebx src
+ C ecx counter, size-1 to 1
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%ecx,4), %mm0
+
+ psrlq %mm7, %mm0
+
+Zdisp( movd, %mm0, 0,(%eax,%ecx,4))
+ loop L(simple)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+
+ movd %mm0, (%eax)
+ movl %edx, %eax
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval (but instead VAR_RETVAL is used)
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ movl PARAM_DST, %edx
+
+ movd %ecx, %mm7
+ subl $7, %eax C size-8
+
+ leal (%edx,%eax,4), %ecx C alignment of dst
+
+ movq 32-8(%ebx,%eax,4), %mm2 C src high qword
+ testb $4, %cl
+
+ jz L(dst_aligned)
+ psllq %mm6, %mm2
+
+ psrlq $32, %mm2
+ decl %eax
+
+ movd %mm2, 32(%edx,%eax,4) C dst high limb
+ movq 32-8(%ebx,%eax,4), %mm2 C new src high qword
+L(dst_aligned):
+
+ movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of size-8 lets the loop stop when %eax goes negative and
+ C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, size-8 step by -4 until <0
+ C ebx src
+ C ecx
+ C edx dst
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psllq %mm6, %mm2
+ subl $4, %eax
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 24(%ebx,%eax,4), %mm0
+
+ psllq %mm6, %mm1
+ movq %mm2, 40(%edx,%eax,4)
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm1, 32(%edx,%eax,4)
+ jnc L(top)
+
+
+ C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+ C
+ C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+ C %eax is between -4 and -1, representing respectively 0 to 3 extra
+ C limbs that must be read.
+
+
+ testl $2, %eax C testl to avoid bad cache line crossing
+ jz L(finish_nottwo)
+
+ C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+ C new mm2 and a new mm0 is loaded.
+
+ psllq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psrlq %mm7, %mm0
+ subl $2, %eax
+
+ por %mm0, %mm2
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm2, 32(%edx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+ testb $1, %al
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ psllq %mm6, %mm1
+
+ movq %mm2, 24(%edx,%eax,4)
+ jz L(finish_even)
+
+
+ C Size is odd, so mm1 and one extra limb to process.
+
+ movd (%ebx), %mm0 C src[0]
+ popl %ebx
+deflit(`FRAME',0)
+
+ movq %mm0, %mm2
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ psllq %mm6, %mm2
+ por %mm0, %mm1
+
+ movq %mm1, 4(%edx) C dst[1,2]
+ movd %mm2, (%edx) C dst[0]
+
+ movl VAR_RETVAL, %eax
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+ C Size is even, so only mm1 left to process.
+
+ movq %mm1, (%edx) C dst[0,1]
+ movl VAR_RETVAL, %eax
+
+ popl %ebx
+ femms
+ ret
+
+EPILOGUE()
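+
+C For reference, the effect of the whole routine in plain C (essentially
+C mpn/generic/lshift.c, assuming 32-bit limbs and 1 <= shift <= 31):
+C
+C	mp_limb_t lshift (mp_limb_t *dst, const mp_limb_t *src,
+C	                  mp_size_t n, unsigned shift)
+C	{
+C	  unsigned tnc = 32 - shift;
+C	  mp_limb_t high = src[n - 1];
+C	  mp_limb_t retval = high >> tnc;   /* bits shifted out the top */
+C	  mp_size_t i;
+C	  for (i = n - 1; i > 0; i--)
+C	    {
+C	      dst[i] = (high << shift) | (src[i - 1] >> tnc);
+C	      high = src[i - 1];
+C	    }
+C	  dst[0] = high << shift;
+C	  return retval;
+C	}
+C
+C The MMX loop above forms the same shift-and-OR combinations two limbs
+C at a time, using psllq/psrlq/por with the counts in %mm6 and %mm7.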
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000000..8a8c144241
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,285 @@
+dnl AMD K6-2 mpn_rshift -- mpn right shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+
+
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ movl PARAM_DST, %ecx
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+	C This loop runs at about 3 cycles/limb, limited by instruction
+	C decoding, and this despite every second access being unaligned.
+
+L(simple):
+ C eax counter, -(size-1) to -1
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ subl $7, %eax C size-8
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+ movq (%ebx), %mm2 C src low qword
+ leal (%ebx,%eax,4), %ebx C src end - 32
+
+ testb $4, %cl
+ leal (%ecx,%eax,4), %ecx C dst end - 32
+
+ notl %eax C -(size-7)
+ jz L(dst_aligned)
+
+ psrlq %mm6, %mm2
+ incl %eax
+
+Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
+ movq 4(%ebx,%eax,4), %mm2 C new src low qword
+L(dst_aligned):
+
+ movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
+ nop C avoid bad cache line crossing
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+	C The use of -(size-7) lets the loop stop when %eax becomes >= 0
+	C and leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, -(size-7) step by +4 until >=0
+ C ebx src end - 32
+ C ecx dst end - 32
+ C edx retval
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psrlq %mm6, %mm2
+ addl $4, %eax
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 4(%ebx,%eax,4), %mm0
+
+ psrlq %mm6, %mm1
+ movq %mm2, -12(%ecx,%eax,4)
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm1, -4(%ecx,%eax,4)
+ ja L(top) C jump if no carry and not zero
+
+
+
+ C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+ C to 3 representing respectively 3 to 0 further limbs.
+
+ testl $2, %eax C testl to avoid bad cache line crossings
+ jnz L(finish_nottwo)
+
+ C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+ C becomes new mm2 and a new mm0 is loaded.
+
+ psrlq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ addl $2, %eax
+
+ por %mm0, %mm2
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm2, -4(%ecx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ testb $1, %al
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ psrlq %mm6, %mm1
+
+ movq %mm2, 4(%ecx,%eax,4)
+ jnz L(finish_even)
+
+
+ C one further extra limb to process
+
+ movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
+ popl %ebx
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ psrlq %mm6, %mm2
+
+ movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
+ movd %mm2, 32-4(%ecx) C dst[size-1]
+
+ movl %edx, %eax C retval
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+ C no further extra limbs
+
+ movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
+ movl %edx, %eax C retval
+
+ popl %ebx
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
new file mode 100644
index 0000000000..8915080f0f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
@@ -0,0 +1,91 @@
+dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb
+dnl K6 1.5 1.85 1.75 1.85
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise ones-complement of src,size and write it to dst,size.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_com_n)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+ shrl %ecx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+ notl %eax
+ movl %eax, (%edx)
+ ret
+
+
+L(two_or_more):
+ pushl %ebx
+FRAME_pushl()
+ movl %ecx, %ebx
+
+ pcmpeqd %mm7, %mm7 C all ones
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx floor(size/2)
+ C ecx counter
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+ movq -8(%eax,%ecx,8), %mm0
+ pxor %mm7, %mm0
+ movq %mm0, -8(%edx,%ecx,8)
+ loop L(top)
+
+
+ jnc L(no_extra)
+ movl (%eax,%ebx,8), %eax
+ notl %eax
+ movl %eax, (%edx,%ebx,8)
+L(no_extra):
+
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
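+
+C For reference: the pcmpeqd on %mm7 above generates the all-ones
+C constant, so the pxor in the loop is a bitwise NOT.  Per limb the
+C whole routine is simply
+C
+C	dst[i] = ~src[i];
+C
+C done a qword (two limbs) at a time.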
diff --git a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000000..46cb3b7ea5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,212 @@
+dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+dnl
+dnl alignment dst/src1/src2, A=0mod8, N=4mod8
+dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+dnl
+dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
+dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
+dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
+dnl
+dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
+dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
+dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl M4_p and M4_i are the MMX and integer instructions
+dnl M4_*_neg_dst means whether to negate the final result before writing
+dnl M4_*_neg_src2 means whether to negate the src2 values before using them
+
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p', `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i', `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl style (the two are equivalent for xor).
+
+M4_choose_op( and_n, pand,0,0, andl,0,0)
+M4_choose_op( andn_n, pandn,0,0, andl,0,1)
+M4_choose_op( nand_n, pand,1,0, andl,1,0)
+M4_choose_op( ior_n, por,0,0, orl,0,0)
+M4_choose_op( iorn_n, por,0,1, orl,0,1)
+M4_choose_op( nior_n, por,1,0, orl,1,0)
+M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones. The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C alignment dst/src1/src2, A=0mod8, N=4mod8
+C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
+C K6 1.75 2.2 2.0 2.28 iorn,xnor
+C K6 2.0 2.25 2.35 2.28 nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+PROLOGUE(M4_function)
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+ FRAME_pushl()
+ movl PARAM_SRC1, %eax
+ movl PARAM_SRC2, %ebx
+ cmpl $1, %ecx
+ movl PARAM_DST, %edx
+ ja L(two_or_more)
+
+
+ movl (%ebx), %ecx
+ popl %ebx
+ifelse(M4_i_neg_src2,1,`notl %ecx')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl %ecx')
+ movl %ecx, (%edx)
+
+ ret
+
+
+L(two_or_more):
+ C eax src1
+ C ebx src2
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ pushl %esi
+ FRAME_pushl()
+ testl $4, %eax
+ jz L(alignment_ok)
+
+ movl (%ebx), %esi
+ addl $4, %ebx
+ifelse(M4_i_neg_src2,1,`notl %esi')
+ M4_i (%eax), %esi
+ addl $4, %eax
+ifelse(M4_i_neg_dst,1,` notl %esi')
+ movl %esi, (%edx)
+ addl $4, %edx
+ decl %ecx
+
+L(alignment_ok):
+ movl %ecx, %esi
+ shrl %ecx
+ jnz L(still_two_or_more)
+
+ movl (%ebx), %ecx
+ popl %esi
+ifelse(M4_i_neg_src2,1,`notl %ecx')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl %ecx')
+ popl %ebx
+ movl %ecx, (%edx)
+ ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+ pcmpeqd %mm7, %mm7 C all ones
+')
+
+ ALIGN(16)
+L(top):
+ C eax src1
+ C ebx src2
+ C ecx counter
+ C edx dst
+ C esi
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ movq -8(%ebx,%ecx,8), %mm0
+ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
+ M4_p -8(%eax,%ecx,8), %mm0
+ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
+ movq %mm0, -8(%edx,%ecx,8)
+
+ loop L(top)
+
+
+ jnc L(no_extra)
+
+ movl -4(%ebx,%esi,4), %ebx
+ifelse(M4_i_neg_src2,1,`notl %ebx')
+ M4_i -4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,` notl %ebx')
+ movl %ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+ popl %esi
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
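+
+C In C terms the eight entrypoints all reduce to one base operation plus
+C optional complements, as selected by the M4_choose_op flags above
+C (a sketch of the integer path; the MMX path is the same shape):
+C
+C	x = src2[i];
+C	if (M4_i_neg_src2)  x = ~x;         /* andn, iorn, xnor  */
+C	r = op (src1[i], x);                /* andl, orl or xorl */
+C	if (M4_i_neg_dst)   r = ~r;         /* nand, nior        */
+C	dst[i] = r;
+C
+C with ~ supplied by pxor against the all-ones %mm7 in the MMX loop.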
diff --git a/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000000..f1dc83db46
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_lshift -- mpn left shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+ nop C avoid bad cache line crossing
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+L(top):
+ C eax counter, size-1 to 1
+ C ebx src
+ C ecx dst
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%ecx,%eax,4)
+ jnz L(top)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+ movl %edx, %eax
+
+ movd %mm0, (%ecx)
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/popham.asm b/rts/gmp/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000000..2c619252bb
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,238 @@
+dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl hamming distance.
+dnl
+dnl popcount hamdist
+dnl K6-2: 9.0 11.5 cycles/limb
+dnl K6: 12.5 13.0
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
+
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ DATA
+ ALIGN(8)
+
+define(LS,
+m4_assert_numargs(1)
+`LF(M4_function,`$1')')
+
+LS(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+LS(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+LS(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+
+LS(rodata_000000FF000000FF):
+ .long 0x000000FF
+ .long 0x000000FF
+')
+
+ .text
+ ALIGN(32)
+
+POP(`ifdef(`PIC', `
+ C avoid shrl crossing a 32-byte boundary
+ nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ orl %ecx, %ecx
+ jz L(zero)
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+ movl $0x000000FF, %edx
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+ punpckldq %mm4, %mm4
+',`
+
+ movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq LS(rodata_3333333333333333), %mm6
+ movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
+ movq LS(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
+
+HAM(`
+Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
+ pxor %mm0, %mm1
+')
+
+ incl %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+POP(` nop C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+HAM(` nop C code alignment')
+
+ psubd %mm1, %mm0 C bit pairs
+HAM(` nop C code alignment')
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+ movq %mm0, %mm1
+ psrlq $8, %mm0
+
+
+ paddb %mm1, %mm0 C words
+
+
+ movq %mm0, %mm1
+ psrlq $16, %mm0
+
+ paddd %mm1, %mm0 C dwords
+
+ pand REG_000000FF000000FF, %mm0
+
+ paddd %mm0, %mm2 C low to total
+ psrlq $32, %mm0
+
+ paddd %mm0, %mm2 C high to total
+ loop L(top)
+
+
+
+ movd %mm2, %eax
+ emms_or_femms
+ ret
+
+L(zero):
+ movl $0, %eax
+ ret
+
+EPILOGUE()
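+
+C The reduction in L(loaded) above is the classic mask-and-add popcount,
+C shown here on a single 32-bit word for clarity (hamdist just starts
+C with x = a ^ b):
+C
+C	unsigned popcount32 (unsigned x)
+C	{
+C	  x -= (x & 0xAAAAAAAA) >> 1;                      /* bit pairs */
+C	  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  /* nibbles   */
+C	  x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);  /* bytes     */
+C	  x += x >> 8;                                     /* words     */
+C	  x += x >> 16;                                    /* dwords    */
+C	  return x & 0xFF;
+C	}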
diff --git a/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000000..cc5948f26c
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_rshift -- mpn right shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx C 32-shift
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx C retval
+ movl PARAM_DST, %ecx
+
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+
+L(simple):
+ C eax counter (negative)
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_1.asm b/rts/gmp/mpn/x86/k6/mul_1.asm
new file mode 100644
index 0000000000..c2220fe4ca
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_1.asm
@@ -0,0 +1,272 @@
+dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
+dnl
+dnl K6: 6.25 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
+C the low limb of the result.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_mul_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(mpn_mul_1,start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+	pushl %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+	movl PARAM_SIZE, %ecx
+	pushl %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+	pushl %edi
+FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+FRAME_pushl()
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ jae L(unroll)
+
+
+ C code offset 0x22 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+ C
+ C this loop 8 cycles/limb
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(simple)
+
+
+ popl %ebp
+
+ popl %edi
+ popl %ebx
+
+ movl %esi, %eax
+ popl %esi
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The code for each limb is 6 cycles, with instruction decoding being the
+C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+C cycles/limb in total.
+C
+C The secret ingredient to get 6.25 is to start the loop with the mul and
+C have the load/store pair at the end. Rotating the load/store to the top
+C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
+C
+C The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+ ALIGN(16) C already aligned to 16 here actually
+L(unroll):
+ movl (%ebx), %eax
+ leal -16(%ebx,%ecx,4), %ebx
+
+ leal -16(%edi,%ecx,4), %edi
+ subl $4, %ecx
+
+ negl %ecx
+
+
+ ALIGN(16) C one byte nop for this alignment
+L(top):
+ C eax scratch
+ C ebx &src[size-4]
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi &dst[size-4]
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi,%ecx,4)
+ movl 12(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 12(%edi,%ecx,4)
+ movl 16(%ebx,%ecx,4), %eax
+
+
+ addl $4, %ecx
+ js L(top)
+
+
+
+ C eax next src limb
+ C ebx &src[size-4]
+ C ecx 0 to 3 representing respectively 4 to 1 further limbs
+ C edx
+ C esi carry
+ C edi &dst[size-4]
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+ addl $2, %ecx
+L(finish_not_two):
+
+
+ testb $1, %cl
+ jnz L(finish_not_one)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi)
+ movl 12(%ebx), %eax
+L(finish_not_one):
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ popl %ebp
+
+ adcl $0, %edx
+
+ movl %eax, 12(%edi)
+ popl %edi
+
+ popl %ebx
+ movl %edx, %eax
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
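+
+C For reference, the operation in plain C, using a 64-bit intermediate
+C for the 32x32->64 product that mull produces (sketch, not GMP code):
+C
+C	mp_limb_t mul_1 (mp_limb_t *dst, const mp_limb_t *src,
+C	                 mp_size_t n, mp_limb_t multiplier)
+C	{
+C	  mp_limb_t carry = 0;
+C	  mp_size_t i;
+C	  for (i = 0; i < n; i++)
+C	    {
+C	      unsigned long long p =
+C	        (unsigned long long) src[i] * multiplier + carry;
+C	      dst[i] = (mp_limb_t) p;          /* low word  */
+C	      carry = (mp_limb_t) (p >> 32);   /* high word */
+C	    }
+C	  return carry;      /* mpn_mul_1c starts carry at the given value */
+C	}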
diff --git a/rts/gmp/mpn/x86/k6/mul_basecase.asm b/rts/gmp/mpn/x86/k6/mul_basecase.asm
new file mode 100644
index 0000000000..1f5a3a4b4b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_basecase.asm
@@ -0,0 +1,600 @@
+dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
+dnl
+dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
+dnl unrolling).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6: UNROLL_COUNT cycles/product (approx)
+dnl 8 9.75
+dnl 16 9.3
+dnl 32 9.3
+dnl Maximum possible with the current code is 32.
+dnl
+dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
+dnl might explain its good performance.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() entry code only
+C once. The saving is about 10-20% on typical sizes coming from the
+C Karatsuba multiply code.
+C
+C Future:
+C
+C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
+C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be
+C 256 bytes.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two_limbs)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ movl (%edx), %edx C xp low limb
+ movl PARAM_WP, %ecx
+
+ mull %edx
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+ decl PARAM_YSIZE
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_WP, %ebx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl %eax, %ecx C yp low limb
+ movl (%edx), %eax C xp low limb
+
+ movl %edx, %esi C xp
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+ movl %esi, 4(%ebx)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ mull %ecx C xp[0] * yp[0]
+
+	pushl %edi
+deflit(`FRAME',12)
+ movl %eax, (%ebx)
+
+ movl 4(%esi), %eax
+ movl %edx, %edi C carry, for wp[1]
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+
+ movl %edi, 4(%ebx)
+ movl 4(%ecx), %ecx C yp[1]
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+
+ movl (%esi), %eax C xp[0]
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ adcl $0, %esi
+
+ movl %edi, 8(%ebx)
+ popl %edi
+
+ movl %esi, 12(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two_limbs):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 times).  A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 10-20
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',0)
+
+ pushl %edi defframe_pushl(SAVE_EDI)
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_WP, %edi
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl %eax, %ebp
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ leal (%edx,%ecx,4), %ebx C xp end
+ xorl %esi, %esi
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi) C final carry
+
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jnz L(ysize_more_than_one_limb)
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+L(ysize_more_than_one_limb):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+C Simple addmul loop.
+C
+C Using ebx and edi pointing at the ends of their respective locations saves
+C a couple of instructions in the outer loop. The inner loop is still 11
+C cycles, the same as the simple loop in aorsmul_1.asm.
+
+ C eax yp
+ C ebx xp end
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1
+ C ebp
+
+ movl 4(%eax), %ebp C multiplier
+ negl %ecx
+
+ movl %ecx, PARAM_XSIZE C -xsize
+ xorl %esi, %esi C initial carry
+
+ leal 4(%eax,%edx,4), %eax C yp end
+ negl %edx
+
+ movl %eax, PARAM_YP
+ movl %edx, PARAM_YSIZE
+
+ jmp L(simple_outer_entry)
+
+
+ C aligning here saves a couple of cycles
+ ALIGN(16)
+L(simple_outer_top):
+ C edx ysize counter, negative
+
+ movl PARAM_YP, %eax C yp end
+ xorl %esi, %esi C carry
+
+ movl PARAM_XSIZE, %ecx C -xsize
+ movl %edx, PARAM_YSIZE
+
+ movl (%eax,%edx,4), %ebp C yp limb multiplier
+L(simple_outer_entry):
+ addl $4, %edi
+
+
+L(simple_inner):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of this addmul
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl $0, %edx
+ addl %eax, (%edi,%ecx,4)
+ adcl %edx, %esi
+
+ incl %ecx
+ jnz L(simple_inner)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi)
+
+ incl %edx
+ jnz L(simple_outer_top)
+
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+C -----------------------------------------------------------------------------
+C Unrolled loop.
+C
+C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
+C some comments.
+C
+C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
+C 0, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
+C is entered.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop. This can't just be fetched through the xp
+C pointer because of the offset applied to it.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
+C to give the starting point in the destination for each unrolled loop (this
+C point is one limb upwards for each limb of yp processed).
+C
+C Having PARAM_YSIZE count negative to zero means it's not necessary to
+C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
+C the stack remain constant and on each loop a leal adjusts them with the
+C PARAM_YSIZE counter value.
+
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_COUNTER_INIT, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_STACK_SPACE, 16)
+
+dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
+dnl slower in this code, hence the defframe empty-if-zero feature is
+dnl disabled.
+dnl
+dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
+dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
+dnl anything isn't clear.
+dnl
+define(`defframe_empty_if_zero_disabled',1)
+
+L(unroll):
+ C eax yp (not used)
+ C ebx xp end (not used)
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1 (not used)
+ C ebp
+deflit(`FRAME', 16)
+
+ leal -2(%ecx), %ebp C one limb processed at start,
+ decl %ecx C and ebp is one less
+
+ shrl $UNROLL_LOG2, %ebp
+ negl %ecx
+
+ subl $VAR_STACK_SPACE, %esp
+deflit(`FRAME', 16+VAR_STACK_SPACE)
+ andl $UNROLL_MASK, %ecx
+
+ movl %ecx, %esi
+ shll $4, %ecx
+
+ movl %ebp, VAR_COUNTER_INIT
+ negl %esi
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%esi,1), %ecx
+')
+
+ movl PARAM_XP, %ebx
+ movl %ebp, VAR_COUNTER
+
+ movl PARAM_WP, %edi
+ movl %ecx, VAR_JMP
+
+ movl (%ebx), %eax
+ leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
+
+ leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
+
+ movl %eax, VAR_XP_LOW
+
+ movl %ebx, PARAM_XP
+ movl PARAM_YP, %ebx
+
+ leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
+ movl 4(%ebx), %ebp C multiplier (yp second limb)
+
+ leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
+
+ movl %ecx, PARAM_WP
+
+ leal 1(%esi), %ecx C adjust parity for decl %ecx above
+
+ movl %ebx, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%ecx,%esi,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ C Aligning here saves a couple of cycles per loop. Using 32 doesn't
+ C cost any extra space, since the inner unrolled loop below is
+ C aligned to 32.
+ ALIGN(32)
+L(unroll_outer_top):
+ C edx ysize
+
+ movl PARAM_YP, %eax
+ movl %edx, PARAM_YSIZE C incremented ysize counter
+
+ movl PARAM_WP, %edi
+
+ movl VAR_COUNTER_INIT, %ebx
+ movl (%eax,%edx,4), %ebp C next multiplier
+
+ movl PARAM_XSIZE, %ecx
+ leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
+
+ movl VAR_XP_LOW, %eax
+ movl %ebx, VAR_COUNTER
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ C using testb is a tiny bit faster than testl
+ testb $1, %cl
+
+ movl %eax, %ecx C low carry
+ movl VAR_JMP, %eax
+
+ movl %edx, %esi C high carry
+ movl PARAM_XP, %ebx
+
+ jnz L(unroll_noswap)
+ movl %ecx, %esi C high,low carry other way around
+
+ movl %edx, %ecx
+L(unroll_noswap):
+
+ jmp *%eax
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax scratch
+ C ebx xp
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi wp
+ C ebp multiplier
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes each limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(unroll_entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4))
+ deflit(`disp1', eval(disp0 + 4))
+ deflit(`disp2', eval(disp1 + 4))
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+Zdisp( addl, %ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp2(%ebx), %eax
+ mull %ebp
+ addl %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(unroll_top)
+
+
+ movl PARAM_YSIZE, %edx
+ addl %ecx, UNROLL_BYTES(%edi)
+
+ adcl $0, %esi
+
+ incl %edx
+ movl %esi, UNROLL_BYTES+4(%edi)
+
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ addl $FRAME, %esp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
new file mode 100644
index 0000000000..70d49b3e57
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
@@ -0,0 +1,672 @@
+dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
+dnl product (measured on the speed difference between 17 and 33 limbs,
+dnl which is roughly the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
+dnl code supports. This value is used only by the tune program to know
+dnl what it can go up to. (An attempt to compile with a bigger value will
+dnl trigger some m4_assert()s in the code, making the build fail.)
+dnl
+dnl The value is determined by requiring the displacements in the unrolled
+dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
+dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
+
+
+dnl Allow a value from the tune program to override config.m4.
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+
+dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
+dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since
+dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
+dnl
+dnl The first addmul is the biggest, and this takes the second least
+dnl significant limb and multiplies it by the third least significant and
+dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
+dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed
+C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 35x35 that do need all of it will
+C at least be getting value for money, because 35x35 spends something like
+C 5780 cycles here.
+C
+C Different values of UNROLL_COUNT give slightly different speeds, between
+C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
+C This isn't a big difference, but it's presumably some alignment effect
+C which, if understood, could give a simple speedup.
+
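+C As a rough C sketch of the same triangle + double + diagonal scheme
+C (illustrative names and plain uint32_t limbs rather than GMP's types,
+C the semantics only, not this file's scheduling):
+C
+C     #include <stdint.h>
+C
+C     /* dst[0..n-1] += src[0..n-1]*mult, returning the carry limb */
+C     static uint32_t addmul_1_c (uint32_t *dst, const uint32_t *src,
+C                                 int n, uint32_t mult)
+C     {
+C       uint64_t c = 0;
+C       for (int i = 0; i < n; i++)
+C         {
+C           c += (uint64_t) src[i] * mult + dst[i];
+C           dst[i] = (uint32_t) c;
+C           c >>= 32;
+C         }
+C       return (uint32_t) c;
+C     }
+C
+C     void sqr_basecase_c (uint32_t *dst, const uint32_t *src, int size)
+C     {
+C       int i, n;
+C       for (i = 0; i < 2*size; i++)
+C         dst[i] = 0;
+C
+C       /* triangle of cross products src[n]*src[n+1..size-1] */
+C       for (n = 0; n < size-1; n++)
+C         dst[n+size] = addmul_1_c (dst + 2*n + 1, src + n + 1,
+C                                   size-1-n, src[n]);
+C
+C       /* double: shift dst[1..2*size-2] left, the bit shifted out
+C          becoming dst[2*size-1] */
+C       uint32_t carry = 0;
+C       for (i = 1; i <= 2*size-2; i++)
+C         {
+C           uint32_t hi = dst[i] >> 31;
+C           dst[i] = (dst[i] << 1) | carry;
+C           carry = hi;
+C         }
+C       dst[2*size-1] = carry;
+C
+C       /* add the squares on the diagonal */
+C       uint64_t c = 0;
+C       for (i = 0; i < size; i++)
+C         {
+C           uint64_t t = (uint64_t) src[i] * src[i];
+C           c += (uint64_t) dst[2*i] + (uint32_t) t;
+C           dst[2*i] = (uint32_t) c;    c >>= 32;
+C           c += (uint64_t) dst[2*i+1] + (uint32_t) (t >> 32);
+C           dst[2*i+1] = (uint32_t) c;  c >>= 32;
+C         }
+C     }
+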
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %ecx
+ je L(two_limbs)
+
+ movl PARAM_DST, %edx
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+deflit(`FRAME',4)
+
+ movl (%ebx), %eax
+ movl PARAM_DST, %ecx
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0]*src[1]
+
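+	C eax:edx is src[0]*src[1], added in twice below since the
+	C square needs that cross product doubled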
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ popl %ebx
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+deflit(`FRAME',0)
+ cmpl $4, %ecx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+
+ movl (%ebx), %eax
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+ pushl %esi
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ pushl %edi
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 20(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+
+ movl %edx, %edi
+ movl 8(%ebx), %edx
+
+ pushl %ebp
+ xorl %ebp, %ebp
+
+ mull %edx C src[0] * src[2]
+
+ addl %eax, %edi
+ movl 4(%ebx), %eax
+
+ adcl %edx, %ebp
+
+ movl 8(%ebx), %edx
+
+ mull %edx C src[1] * src[2]
+
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+
+ C eax will be dst[5]
+ C ebx
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ xorl %eax, %eax
+ addl %esi, %esi
+ adcl %edi, %edi
+ adcl %ebp, %ebp
+ adcl %edx, %edx
+ adcl $0, %eax
+
+ addl %esi, 4(%ecx)
+ adcl %edi, 8(%ecx)
+ adcl %ebp, 12(%ecx)
+
+ popl %ebp
+ popl %edi
+
+ adcl %edx, 16(%ecx)
+
+ popl %esi
+ popl %ebx
+
+ adcl %eax, 20(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(STACK_SPACE, 24)
+
+ ALIGN(16)
+L(four_or_more):
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C
+C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
+C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
+C a 5780 cycle operation, which is not surprising since the loop here is 8
+C c/l and mpn_mul_1 is 6.25 c/l.
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ leal 4(%edx), %edi
+
+ movl %ebx, SAVE_EBX
+ leal 4(%eax), %ebx
+
+ movl %esi, SAVE_ESI
+ xorl %esi, %esi
+
+ movl %ebp, SAVE_EBP
+
+ C eax
+ C ebx src+4
+ C ecx size
+ C edx
+ C esi
+ C edi dst+4
+ C ebp
+
+ movl (%eax), %ebp C multiplier
+ leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
+
+
+ ALIGN(16)
+L(mul_1):
+ C eax scratch
+ C ebx src ptr
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst ptr
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(mul_1)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K6 doesn't do any branch prediction on indirect jumps, which is actually
+C good here because the target is different each time. The unrolled addmul
+C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
+C the indirect jump is quickly recovered.
+
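+C Concretely, the entry point used for an outer pass with n limbs still
+C to process is (roughly speaking)
+C
+C     entry = L(unroll_inner_end) - n*CODE_BYTES_PER_LIMB
+C
+C which the leal (or L(pic_calc)) below computes for the first pass,
+C where n=size-2, and each later pass adds CODE_BYTES_PER_LIMB to
+C VAR_JMP as one fewer limb is processed.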
+
+dnl This value is also implicitly encoded in a shift and add.
+dnl
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31. Above that an offset must be added to them.
+dnl
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+ C eax
+ C ebx &src[size]
+ C ecx
+ C edx
+ C esi carry
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %ebx')
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %edi')
+
+ negl %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limitation UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+ C
+ ASSERT(ae,`
+ movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx
+ ')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx &src[size], constant
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi high limb to store
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+ testb $1, %cl
+
+ movl %edx, %esi C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ movl %edx, VAR_JMP
+ leal 4(%edi), %edi
+
+ C A branch-free version of this using some xors was found to be a
+ C touch slower than just a conditional jump, despite the jump
+ C switching between taken and not taken on every loop.
+
+ifelse(eval(UNROLL_COUNT%2),0,
+ jz,jnz) L(unroll_noswap)
+ movl %esi, %eax C high,low carry other way around
+
+ movl %ecx, %esi
+ movl %eax, %ecx
+L(unroll_noswap):
+
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/esi should start.
+ C
+ C An attempt was made at padding here to get the end of the unrolled
+ C code to come out on a good alignment, to save padding before
+ C L(corner). This worked, but turned out to run slower than just an
+	C ALIGN(2). The reason for this is not clear; it might be related
+ C to the different speeds on different UNROLL_COUNTs noted above.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx src
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/esi swapped on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %esi, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+')
+')
+L(unroll_inner_end):
+
+ addl %esi, -4+OFFSET(%edi)
+
+ movl VAR_COUNTER, %edx
+ jadcl0( %ecx)
+
+ movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %ebx
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C ebx &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%ebx), %ebp
+
+ movl -8(%ebx), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ adcl $0, %edx
+
+ movl -4(%ebx), %eax
+ movl %edx, %esi
+ movl %eax, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ adcl $0, %edx
+
+ addl %eax, (%edi)
+ adcl $0, %edx
+
+ movl %edx, %esi
+ movl %ebx, %eax
+
+ mull %ecx
+
+ addl %esi, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+C The loop measures about 6 cycles/iteration, though it looks like it should
+C decode in 5.
+
+L(lshift_start):
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_DST, %edi
+ subl $1, %ecx C size-1 and clear carry
+
+ movl PARAM_SRC, %ebx
+ movl %ecx, %edx
+
+ xorl %eax, %eax C ready for adcl
+
+
+ ALIGN(16)
+L(lshift):
+ C eax
+ C ebx src (for later use)
+ C ecx counter, decrementing
+ C edx size-1 (for later use)
+ C esi
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+ leal 8(%edi), %edi
+ loop L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%ebx), %eax C src[0]
+
+ leal 4(%ebx,%edx,4), %ebx C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't been set yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+ ALIGN(16)
+L(diag):
+ C eax scratch
+ C ebx &src[size]
+ C ecx counter, negative
+ C edx carry
+ C esi scratch
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ movl %edx, %esi
+
+ mull %eax
+
+ addl %esi, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/README b/rts/gmp/mpn/x86/k7/README
new file mode 100644
index 0000000000..c34315c401
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/README
@@ -0,0 +1,145 @@
+
+ AMD K7 MPN SUBROUTINES
+
+
+This directory contains code optimized for the AMD Athlon CPU.
+
+The mmx subdirectory has routines using MMX instructions. All Athlons have
+MMX, the separate directory is just so that configure can omit it if the
+assembler doesn't support MMX.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache.
+
+ cycles/limb
+ mpn_add/sub_n 1.6
+
+ mpn_copyi 0.75 or 1.0 \ varying with data alignment
+ mpn_copyd 0.75 or 1.0 /
+
+ mpn_divrem_1 17.0 integer part, 15.0 fractional part
+ mpn_mod_1 17.0
+ mpn_divexact_by3 8.0
+
+ mpn_l/rshift 1.2
+
+ mpn_mul_1 3.4
+ mpn_addmul/submul_1 3.9
+
+ mpn_mul_basecase 4.42 cycles/crossproduct (approx)
+
+ mpn_popcount 5.0
+ mpn_hamdist 6.0
+
+Prefetching of sources hasn't yet been tried.
+
+
+
+NOTES
+
+cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+Unsigned "mul"s can be issued every 3 cycles, which suggests 3 cycles/limb
+is a limit on the speed of the multiplication routines. The documentation
+shows mul executing in IEU0 (or maybe in IEU0 and IEU1 together), so it
+might be that, to get near 3 cycles, code has to be arranged so that
+nothing else is issued to IEU0. A busy IEU0 could explain why some code
+takes 4 cycles and other apparently equivalent code takes 5.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines and up to 64 for some.
+The K7 has 64k L1 code cache so quite big unrolling is allowable.
+
+Computed jumps into the unrolling are used to handle sizes not a multiple of
+the unrolling. An attractive feature of this is that times increase
+smoothly with operand size, but it may be that some routines should just
+have simple loops to finish up, especially when PIC adds between 2 and 16
+cycles to get %eip.
+
+Position independent code is implemented using a call to get %eip for the
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory.
+
+Branch prediction, in the absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three direct-path instructions which have no
+successive dependencies. K7 always decodes three and has out-of-order
+execution, but the groupings show what slots might be available and what
+dependency chains exist.
+
+When there are vector-path instructions, an effort is made to get triplets
+of direct-path instructions in between them, even if there are dependencies,
+since this maximizes decoding throughput and might save a cycle or two if
+decoding is the limiting factor.
+
+
+
+INSTRUCTIONS
+
+adcl direct
+divl 39 cycles back-to-back
+lodsl,etc vector
+loop 1 cycle vector (decl/jnz opens up one decode slot)
+movd reg vector
+movd mem direct
+mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
+popl vector (use movl for more than one pop)
+pushl direct, will pair with a load
+shrdl %cl vector, 3 cycles, seems to be 3 decode too
+xorl r,r false read dependency recognised
+
+
+
+REFERENCES
+
+"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
+22007, revision E, November 1999. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22007.pdf
+
+"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
+This describes the femms and prefetch instructions. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21928.pdf
+
+"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
+publication number 22466, revision B, August 1999. This describes
+instructions added in the Athlon processor, such as pswapd and the extra
+prefetch forms. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22466.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general Athlon optimizations as well as
+3DNow. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/k7/aors_n.asm b/rts/gmp/mpn/x86/k7/aors_n.asm
new file mode 100644
index 0000000000..85fa9d3036
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/aors_n.asm
@@ -0,0 +1,250 @@
+dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+dnl
+dnl K7: 1.64 cycles/limb (at 16 limb/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 1.9
+dnl 16 1.64
+dnl 32 1.7
+dnl 64 2.0
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C This code runs at 1.64 cycles/limb, which is probably the best possible
+C with plain integer operations. Each limb is 2 loads and 1 store, and in
+C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
+C c/l.
+
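+C A plain C model of this contract, with illustrative names and uint32_t
+C limbs (a sketch of the semantics, not of the unrolled loop below):
+C
+C     #include <stdint.h>
+C
+C     uint32_t add_n_c (uint32_t *dst, const uint32_t *src1,
+C                       const uint32_t *src2, long size)
+C     {
+C       uint32_t carry = 0;   /* or the "carry" parameter for _nc */
+C       for (long i = 0; i < size; i++)
+C         {
+C           uint64_t s = (uint64_t) src1[i] + src2[i] + carry;
+C           dst[i] = (uint32_t) s;
+C           carry = (uint32_t) (s >> 32);
+C         }
+C       return carry;   /* what the asm produces via setc %al */
+C     }
+C
+C For the subtract version the adcl is an sbbl and the return value is
+C the borrow, but the structure is the same.
+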
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+defframe(SAVE_EDI, -16)
+deflit(STACK_SPACE, 16)
+
+ .text
+ ALIGN(32)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp LF(M4_function_n,start)
+EPILOGUE()
+
+PROLOGUE(M4_function_n)
+
+ xorl %eax, %eax C carry
+L(start):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_SRC2, %edx
+ movl PARAM_SRC1, %ebx
+ jae L(unroll)
+
+ movl PARAM_DST, %edi
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+ shrl %eax
+
+	C This loop is in a single 16 byte code block already, so no
+ C alignment necessary.
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+ jnz L(simple)
+
+ movl $0, %eax
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ setc %al
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ C This is at 0x55, close enough to aligned.
+L(unroll):
+deflit(`FRAME',STACK_SPACE)
+ movl %ebp, SAVE_EBP
+ andl $-2, %ecx C size low bit masked out
+ andl $1, PARAM_SIZE C size low bit kept
+
+ movl %ecx, %edi
+ decl %ecx
+ movl PARAM_DST, %ebp
+
+ shrl $UNROLL_LOG2, %ecx
+ negl %edi
+ movl %esi, SAVE_ESI
+
+ andl $UNROLL_MASK, %edi
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per limb
+')
+ negl %edi
+ shrl %eax
+
+ leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
+ leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edi,%edi,8), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax zero
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi scratch (was computed jump)
+ C edi dst
+ C ebp scratch
+
+ leal UNROLL_BYTES(%edx), %edx
+
+L(entry):
+deflit(CHUNK_COUNT, 2)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %esi)
+ movl disp1(%ebx), %ebp
+Zdisp( M4_inst,disp0,(%edx), %esi)
+Zdisp( movl, %esi, disp0,(%edi))
+ M4_inst disp1(%edx), %ebp
+ movl %ebp, disp1(%edi)
+')
+
+ decl %ecx
+ leal UNROLL_BYTES(%ebx), %ebx
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+	movl	PARAM_SIZE, %esi
+ movl SAVE_EBP, %ebp
+ movl $0, %eax
+
+ decl %esi
+ js L(even)
+
+ movl (%ebx), %ecx
+ M4_inst UNROLL_BYTES(%edx), %ecx
+ movl %ecx, (%edi)
+L(even):
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ setc %al
+
+ movl SAVE_ESI, %esi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/aorsmul_1.asm b/rts/gmp/mpn/x86/k7/aorsmul_1.asm
new file mode 100644
index 0000000000..9f9c3daaf4
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -0,0 +1,364 @@
+dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl K7: 3.9 cycles/limb.
+dnl
+dnl Future: It should be possible to avoid the separate mul after the
+dnl unrolled loop by moving the movl/adcl to the top.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 4.42
+dnl 8 4.16
+dnl 16 3.9
+dnl 32 3.9
+dnl 64 3.87
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1',`
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+
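+C As a plain C model of that contract, with illustrative names and
+C uint32_t limbs (the semantics only, not the unrolled loop below):
+C
+C     #include <stdint.h>
+C
+C     uint32_t addmul_1_c (uint32_t *dst, const uint32_t *src,
+C                          long size, uint32_t mult)
+C     {
+C       uint32_t carry = 0;   /* or the "carry" parameter for _1c */
+C       for (long i = 0; i < size; i++)
+C         {
+C           uint64_t p = (uint64_t) src[i] * mult + dst[i] + carry;
+C           dst[i] = (uint32_t) p;
+C           carry = (uint32_t) (p >> 32);
+C         }
+C       return carry;
+C     }
+C
+C For submul the product is subtracted from dst[i] instead and the
+C return value is the borrow out of the top limb.
+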
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(SAVE_SIZE, 16)
+
+ .text
+ ALIGN(32)
+PROLOGUE(M4_function_1)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+ xorl %ecx, %ecx
+
+ decl %edx
+ jnz LF(M4_function_1c,start_1)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ M4_inst %eax, (%ecx)
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(M4_function_1c)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ decl %edx
+ jnz L(more_than_one_limb)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ addl PARAM_CARRY, %eax
+
+ adcl $0, %edx
+ M4_inst %eax, (%ecx)
+
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+
+
+ C offset 0x44 so close enough to aligned
+L(more_than_one_limb):
+ movl PARAM_CARRY, %ecx
+L(start_1):
+ C eax src
+ C ecx initial carry
+ C edx size-1
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',16)
+
+ movl %ebx, SAVE_EBX
+ movl %esi, SAVE_ESI
+ movl %edx, %ebx C size-1
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+ cmpl $UNROLL_THRESHOLD, %edx
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edi, SAVE_EDI
+
+ movl (%esi), %eax C src low limb
+ movl PARAM_DST, %edi
+ ja L(unroll)
+
+
+ C simple loop
+
+ leal 4(%esi,%ebx,4), %esi C point one limb past last
+ leal (%edi,%ebx,4), %edi C point at last limb
+ negl %ebx
+
+ C The movl to load the next source limb is done well ahead of the
+ C mul. This is necessary for full speed, and leads to one limb
+ C handled separately at the end.
+
+L(simple):
+ C eax src limb
+ C ebx loop counter
+ C ecx carry limb
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi,%ebx,4)
+ movl (%esi,%ebx,4), %eax
+ adcl $0, %edx
+
+ incl %ebx
+ movl %edx, %ecx
+ jnz L(simple)
+
+
+ mull %ebp
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi)
+ adcl $0, %edx
+ movl SAVE_EDI, %edi
+
+ addl $SAVE_SIZE, %esp
+ movl %edx, %eax
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax src low limb
+ C ebx size-1
+ C ecx carry
+ C edx size-1
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+dnl overlapping with parameters no longer needed
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_MULTIPLIER')
+
+ subl $2, %ebx C (size-2)-1
+ decl %edx C size-2
+
+ shrl $UNROLL_LOG2, %ebx
+ negl %edx
+
+ movl %ebx, VAR_COUNTER
+ andl $UNROLL_MASK, %edx
+
+ movl %edx, %ebx
+ shll $4, %edx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ebx,1), %edx
+')
+ negl %ebx
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %eax, %ecx C initial carry, becomes low carry
+ adcl $0, %edx
+ testb $1, %bl
+
+ movl 4(%esi), %eax C src second limb
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
+
+ movl %edx, %ebx C high carry
+ cmovnz( %ecx, %ebx) C high,low carry other way around
+ cmovnz( %edx, %ecx)
+
+ jmp *VAR_JUMP
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edx,%ebx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+C This code uses a "two carry limbs" scheme. At the top of the loop the
+C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For
+C the computed jump an odd size means they start one way around, an even
+C size the other. Either way one limb is handled separately at the start of
+C the loop.
+C
+C The positioning of the movl to load the next source limb is important.
+C Moving it after the adcl with a view to avoiding a separate mul at the end
+C of the loop slows the code down.
+
+ ALIGN(32)
+L(top):
+ C eax src limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src+8
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 17 bytes each limb
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+ mull %ebp
+
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ecx
+
+
+ mull %ebp
+
+ M4_inst %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jns L(top)
+
+
+ C eax src limb
+ C ebx carry high
+ C ecx carry low
+ C edx
+ C esi
+ C edi dst (points at second last limb)
+ C ebp multiplier
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ mull %ebp
+
+ M4_inst %ecx, disp0(%edi)
+ movl SAVE_EBP, %ebp
+
+ adcl %ebx, %eax
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+ M4_inst %eax, disp1(%edi)
+ movl SAVE_EDI, %edi
+
+ adcl $0, %edx
+ addl $SAVE_SIZE, %esp
+
+ movl %edx, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/diveby3.asm b/rts/gmp/mpn/x86/k7/diveby3.asm
new file mode 100644
index 0000000000..57684958a5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/diveby3.asm
@@ -0,0 +1,131 @@
+dnl AMD K7 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl K7: 8.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3) and floor(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
+
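+C The scheme is essentially that of mpn/generic/diveby3.c and can be
+C modelled in plain C as follows (illustrative names; the arithmetic
+C only, not this file's instruction scheduling):
+C
+C     #include <stdint.h>
+C
+C     uint32_t divexact_by3c_c (uint32_t *dst, const uint32_t *src,
+C                               long size, uint32_t carry)
+C     {
+C       uint32_t c = carry;
+C       for (long i = 0; i < size; i++)
+C         {
+C           uint32_t s = src[i];
+C           uint32_t l = s - c;
+C           c = (l > s);               /* borrow from the subtract */
+C           l *= 0xAAAAAAAB;           /* INVERSE_3 */
+C           dst[i] = l;
+C           c += (l >= 0x55555556);    /* ONE_THIRD_CEIL */
+C           c += (l > 0xAAAAAAAA);     /* TWO_THIRDS_FLOOR */
+C         }
+C       return c;
+C     }
+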
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %ecx
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ movl PARAM_CARRY, %ebx
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_SIZE, %ebp
+ pushl %edi defframe_pushl(SAVE_EDI)
+
+ movl (%ecx), %eax C src low limb
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl PARAM_DST, %edi
+ movl $TWO_THIRDS_FLOOR, %esi
+ leal -4(%ecx,%ebp,4), %ecx C &src[size-1]
+
+ subl %ebx, %eax
+
+ setc %bl
+ decl %ebp
+ jz L(last)
+
+ leal (%edi,%ebp,4), %edi C &dst[size-1]
+ negl %ebp
+
+
+ ALIGN(16)
+L(top):
+ C eax src limb, carry subtracted
+ C ebx carry limb (0 or 1)
+ C ecx &src[size-1]
+ C edx scratch
+ C esi TWO_THIRDS_FLOOR
+ C edi &dst[size-1]
+ C ebp counter, limbs, negative
+
+ imull $INVERSE_3, %eax, %edx
+
+ movl 4(%ecx,%ebp,4), %eax C next src limb
+ cmpl $ONE_THIRD_CEIL, %edx
+
+ sbbl $-1, %ebx C +1 if result>=ceil(b/3)
+ cmpl %edx, %esi
+
+ sbbl %ebx, %eax C and further 1 if result>=ceil(b*2/3)
+ movl %edx, (%edi,%ebp,4)
+ incl %ebp
+
+ setc %bl C new carry
+ jnz L(top)
+
+
+
+L(last):
+ C eax src limb, carry subtracted
+ C ebx carry limb (0 or 1)
+ C ecx &src[size-1]
+ C edx scratch
+ C esi multiplier
+ C edi &dst[size-1]
+ C ebp
+
+ imull $INVERSE_3, %eax
+
+ cmpl $ONE_THIRD_CEIL, %eax
+ movl %eax, (%edi)
+ movl SAVE_EBP, %ebp
+
+ sbbl $-1, %ebx C +1 if eax>=ceil(b/3)
+ cmpl %eax, %esi
+ movl $0, %eax
+
+ adcl %ebx, %eax C further +1 if eax>=ceil(b*2/3)
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/gmp-mparam.h b/rts/gmp/mpn/x86/k7/gmp-mparam.h
new file mode 100644
index 0000000000..c3bba0afc4
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/gmp-mparam.h
@@ -0,0 +1,100 @@
+/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+/* the low limb is ready after 4 cycles, but normally it's the high limb
+ which is of interest, and that comes out after 6 cycles */
+#ifndef UMUL_TIME
+#define UMUL_TIME 6 /* cycles */
+#endif
+
+/* AMD doco says 40, but it measures 39 back-to-back */
+#ifndef UDIV_TIME
+#define UDIV_TIME 39 /* cycles */
+#endif
+
+/* using bsf */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 7 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 26
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 177
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 52
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 173
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 76
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 114
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 34
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 54
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 720, 1440, 2944, 7680, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 736
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 6912
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 784, 1696, 3200, 7680, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 800
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 8448
+#endif
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyd.asm b/rts/gmp/mpn/x86/k7/mmx/copyd.asm
new file mode 100644
index 0000000000..33214daa1f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -0,0 +1,136 @@
+dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K7 0.75 1.0 1.0 0.75
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The various comments in mpn/x86/k7/copyi.asm apply here too.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+define(SAVE_ESI,`PARAM_SRC')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_copyd)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src
+ C ebx scratch
+ C ecx counter
+ C edx dst
+ C
+ C this loop is 2 cycles/limb
+
+ movl -4(%eax,%ecx,4), %ebx
+ movl %ebx, -4(%edx,%ecx,4)
+ decl %ecx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %esi, SAVE_ESI
+ leal (%eax,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %esi
+
+ andl %esi, %ebx
+ movl SAVE_ESI, %esi
+ subl $4, %ecx C size-4
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl 12(%eax,%ecx,4), %ebx
+ movl %ebx, 12(%edx,%ecx,4)
+ decl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, limbs
+ C edx dst
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq (%eax,%ecx,4), %mm1
+ subl $4, %ecx
+ movq %mm0, 16+8(%edx,%ecx,4)
+ movq %mm1, 16(%edx,%ecx,4)
+ jns L(top)
+
+
+ C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %cl
+ jz L(finish_not_two)
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq %mm0, 8(%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jz L(done)
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyi.asm b/rts/gmp/mpn/x86/k7/mmx/copyi.asm
new file mode 100644
index 0000000000..b234a1628c
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -0,0 +1,147 @@
+dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K7 0.75 1.0 1.0 0.75
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do two loads, or two stores, or a load and a store, in one
+C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
+C however nothing under 0.7 c/l is known.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
+
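+C The alignment fixup amounts to the following C sketch (illustrative;
+C the real copying is done with movq):
+C
+C     if (size >= 5 && ((uintptr_t) dst & (uintptr_t) src & 4))
+C       {
+C         *dst++ = *src++;    /* one limb makes both 0mod8 */
+C         size--;
+C       }
+C     /* ... 64-bit copies, then 0 to 3 odd limbs at the end */
+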
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src, incrementing
+ C ebx scratch
+ C ecx counter
+ C edx dst, incrementing
+ C
+ C this loop is 2 cycles/limb
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ decl %ecx
+ leal 4(%eax), %eax
+ leal 4(%edx), %edx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %eax, %ebx
+ leal -12(%eax,%ecx,4), %eax C src end - 12
+ subl $3, %ecx C size-3
+
+ andl %edx, %ebx
+ leal (%edx,%ecx,4), %edx C dst end - 12
+ negl %ecx
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl (%eax,%ecx,4), %ebx
+ movl %ebx, (%edx,%ecx,4)
+ incl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src end - 12
+ C ebx
+ C ecx counter, negative, limbs
+ C edx dst end - 12
+
+ movq (%eax,%ecx,4), %mm0
+ movq 8(%eax,%ecx,4), %mm1
+ addl $4, %ecx
+ movq %mm0, -16(%edx,%ecx,4)
+ movq %mm1, -16+8(%edx,%ecx,4)
+ ja L(top) C jump no carry and not zero
+
+
+ C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ movq (%eax,%ecx,4), %mm0
+ movq %mm0, (%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jnz L(done)
+
+ movl 8(%eax), %ebx
+ movl %ebx, 8(%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
new file mode 100644
index 0000000000..483ad6a9a1
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -0,0 +1,718 @@
+dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
+dnl
+dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C The "and"s shown in the paper are done here with "cmov"s. "m" is written
+C for m', and "d" for d_norm, which won't cause any confusion since it's
+C only the normalized divisor that's of any use in the code. "b" is written
+C for 2^N, the size of a limb, N being 32 here.
+C
+C mpn_divrem_1 avoids one division if the src high limb is less than the
+C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
+C circumstances that will be a very rare event.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+
+
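+C One quotient-limb step can be modelled in C along the following lines
+C (the arithmetic only, with illustrative names; the real loop below
+C also interleaves fetching and shifting the next src limb via MMX):
+C
+C     #include <stdint.h>
+C
+C     /* n2:n10 is the two-limb partial remainder, d the normalized
+C        divisor (high bit set), m = floor((b*(b-d)-1)/d), b=2^32 */
+C     uint32_t div_step_c (uint32_t *n2, uint32_t n10,
+C                          uint32_t d, uint32_t m)
+C     {
+C       uint32_t n1 = n10 >> 31;
+C       uint32_t nadj = n10 + (n1 ? d : 0);    /* mod b, as in the code */
+C       uint64_t t = (uint64_t) m * (*n2 + n1) + nadj;
+C       uint32_t q1 = *n2 + (uint32_t) (t >> 32);
+C
+C       uint64_t n = ((uint64_t) *n2 << 32) | n10;
+C       uint64_t qd = ((uint64_t) q1 + 1) * d; /* (q1+1)*d, no wrap */
+C       if (n < qd)                            /* q1+1 was one too many */
+C         {
+C           *n2 = (uint32_t) (n + d - qd);     /* addback */
+C           return q1;
+C         }
+C       *n2 = (uint32_t) (n - qd);
+C       return q1 + 1;
+C     }
+C
+C The q1_ff path below is the q1+1==0 instance of the same arithmetic.
+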
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 42 c/l.
+dnl
+dnl At 3 limbs the mul is a touch faster than div on the integer part, and
+dnl even more so on the fractional part.
+
+deflit(MUL_THRESHOLD, 3)
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi
+ jmp LF(mpn_divrem_1,start_1c)
+
+EPILOGUE()
+
+
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ jz L(no_skip_div)
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C one less div if high<divisor
+ jnb L(no_skip_div)
+
+ movl $0, (%edi,%ecx,4) C dst high limb
+ decl %ecx C size-1
+ movl %eax, %edx C src high limb as initial carry
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+
+C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
+C It'd be possible to write them out without the looping, but no speedup
+C would be expected.
+C
+C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
+C integer part, but curiously not on the fractional part, where %ebp is a
+C (fixed) couple of cycles faster.
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ movl %edx, %eax
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ leal 12(%edi), %ebx
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ebx, VAR_DST_STOP
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ orl %ebx, %ebx C size
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ jz L(start_zero)
+ movl %eax, VAR_SRC
+ cmpl $1, %ebx
+
+ movl 8(%eax), %esi C src high limb
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ movl %eax, VAR_SRC
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ shll %cl, %edi C n2 = carry << l
+ movl $0, %esi C n10 = 0
+
+	C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 we
+	C must have xsize!=0
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
+C execution. The instruction scheduling is important, with various
+C apparently equivalent forms running 1 to 5 cycles slower.
+C
+C A lower bound for the time would seem to be 16 cycles, based on the
+C following successive dependencies.
+C
+C cycles
+C n2+n1 1
+C mul 6
+C q1+1 1
+C mul 6
+C sub 1
+C addback 1
+C ---
+C 16
+C
+C This chain is what the loop has already, but 16 cycles isn't achieved.
+C K7 has enough decode, and probably enough execute (depending maybe on what
+C a mul actually consumes), but nothing running under 17 has been found.
+C
+C In theory n2+n1 could be done in the sub and addback stages (by
+C calculating both n2 and n2+n1 there), but lack of registers makes this an
+C unlikely proposition.
+C
+C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
+C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
+C chain, and nothing better than 18 cycles has been found when using it.
+C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
+C be an extremely rare event.
+C
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C if some special data produces this case consistently, the q1_ff special
+C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
+C induce the q1_ff case, for speed measurements or testing. Note that
+C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
+C
+C The instruction groupings and empty comments show the cycles for a naive
+C in-order view of the code (conveniently ignoring the load latency on
+C VAR_INVERSE). This shows some of where the time is going, but is nonsense
+C to the extent that out-of-order execution rearranges it. In this case
+C there's 19 cycles shown, but it executes at 17.
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl VAR_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ movl VAR_DST, %ecx
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+
+ leal -4(%ecx), %ecx
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %eax, %ecx
+
+ movl %ebx, (%ecx)
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This makes the code a bit smaller and simpler, and
+C costs only 1 cycle (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx		C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ movl VAR_DST_STOP, %ecx
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx		C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+L(integer_none):
+ cmpl $0, PARAM_XSIZE
+ jne L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl %ecx, VAR_DST
+
+ movd %mm0, %esi C next n10
+
+ movl $-1, (%ecx)
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C Being the fractional part, the "source" limbs are all zero, meaning
+C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
+C
+C The loop runs at 15 cycles. The dependent chain is the same as the
+C general case above, but without the n2+n1 stage (due to n1==0), so 15
+C would seem to be the lower bound.
+C
+C A not entirely obvious simplification is that q1+1 never overflows a limb,
+C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
+C rnd() means rounding down to a multiple of d.
+C
+C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
+C = m*d + b*d - m - b
+C = floor((b(b-d)-1)/d)*d + b*d - m - b
+C = rnd(b(b-d)-1) + b*d - m - b
+C = rnd(b(b-d)-1 + b*d) - m - b
+C = rnd(b*b-1) - m - b
+C <= (b-2)*b
+C
+C Unchanged from the general case is that the final quotient limb q can be
+C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
+C equation 8.4 of the paper which simplifies as follows when n1==0 and
+C n0==0.
+C
+C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
+C
+C As before, the instruction groupings and empty comments show a naive
+C in-order view of the code, which is made a nonsense by out of order
+C execution. There's 17 cycles shown, but it executes at 15.
+C
+C Rotating the store q and remainder->n2 instructions up to the top of the
+C loop gets the run time down from 16 to 15.
+
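+C For illustration, the step from the sketch further up simplifies as
+C follows when n10==0 and n1==0 (same assumed types, hypothetical name):
+C
+C    mp_limb_t
+C    fraction_step (mp_limb_t n2, mp_limb_t d, mp_limb_t m, mp_limb_t *q)
+C    {
+C      unsigned long long prod = (unsigned long long) m * n2;
+C      mp_limb_t q1 = n2 + (mp_limb_t) (prod >> 32);  /* <= b-2, as shown */
+C      unsigned long long n = (unsigned long long) n2 << 32;
+C      unsigned long long r = n - (unsigned long long) q1 * d - d;
+C      if (r >> 32)
+C        { r += d; *q = q1; }
+C      else
+C        *q = q1 + 1;
+C      return (mp_limb_t) r;
+C    }
+C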
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx
+ movl %edi, %eax
+
+ subl $8, %ecx
+
+ jmp L(fraction_entry)
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2 carry, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi (will be n2)
+ C ebp divisor
+
+ movl %ebx, (%ecx) C previous q
+ movl %eax, %edi C remainder->n2
+
+L(fraction_entry):
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ C
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %esi, %ecx
+
+ jne L(fraction_top)
+
+
+ movl %ebx, (%ecx)
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/lshift.asm b/rts/gmp/mpn/x86/k7/mmx/lshift.asm
new file mode 100644
index 0000000000..4d17c881ec
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -0,0 +1,472 @@
+dnl AMD K7 mpn_lshift -- mpn left shift.
+dnl
+dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. The bits shifted out at the left are
+C the return value.
+C
+C The comments in mpn_rshift apply here too.
+
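+C As a reference sketch only, the whole operation in portable C,
+C assuming a 32-bit mp_limb_t and 1 <= shift <= 31 as per the mpn
+C conventions (lshift_ref is a hypothetical name):
+C
+C    mp_limb_t
+C    lshift_ref (mp_limb_t *dst, const mp_limb_t *src, long size,
+C                unsigned shift)
+C    {
+C      mp_limb_t retval = src[size-1] >> (32 - shift);
+C      long i;
+C      for (i = size-1; i > 0; i--)
+C        dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C      dst[0] = src[0] << shift;
+C      return retval;
+C    }
+C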
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx
+
+ shldl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shll %cl, %edx
+
+ movl %edx, (%edi)
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6
+ movd (%edx,%eax,4), %mm5 C src high limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ negl %ecx
+ movd (%edx), %mm4 C src low limb
+
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+
+L(simple_top):
+ C eax loop counter, limbs
+ C ebx
+ C ecx
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src low limb
+ C mm5 src high limb
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%edx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq %mm6, %mm5
+ psllq %mm6, %mm4
+
+ psrlq $32, %mm5
+ movd %mm4, (%edi) C dst low limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx (saved)
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src high limb, for return value
+ C mm6 lshift
+
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+ leal -4(%edx,%eax,4), %edx C &src[size-2]
+
+ testb $4, %dl
+ movq (%edx), %mm1 C src high qword
+
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb (marked xxx) separately to
+ C make it so
+ C
+ C source -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest -4(edi,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ psllq %mm6, %mm1
+ subl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ psrlq $32, %mm1
+ decl %eax C size-2 is new size-1
+
+ movd %mm1, 4(%edi,%eax,4)
+ movq (%edx), %mm1 C new src high qword
+L(start_src_aligned):
+
+
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+ psllq %mm6, %mm5
+
+ testl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
+ C shift is 32 bits extra. High limb of dst (marked xxx) handled
+ C here separately.
+ C
+ C source %edx
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest %edi
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ psllq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ psrlq $32, %mm1
+
+ movd %mm1, 4(%edi)
+ movq %mm0, %mm1
+ subl $4, %edi
+
+ movd %ecx, %mm6 C new lshift
+L(start_dst_aligned):
+
+ decl %eax C size-2, two last limbs handled at end
+ movq %mm1, %mm2 C copy of src high qword
+ negl %ecx
+
+ andl $-2, %eax C round size down to even
+ addl $64, %ecx
+
+ movl %eax, %ebx
+ negl %eax
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C rshift = 64-lshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax size (for use at end)
+ C ebx loop counter
+ C ecx rshift
+ C edx src
+ C esi computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating, mm2 first)
+ C mm2 /
+ C mm6 lshift
+ C mm7 rshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 - 8))
+
+ movq disp0(%edx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ movq %mm0, disp0(%edi)
+
+
+ movq disp1(%edx), %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm1, %mm0
+ movq %mm0, disp1(%edi)
+')
+
+ subl $UNROLL_BYTES, %edx
+ subl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+
+define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
+
+L(end):
+ testb $1, %al
+ movl SAVE_EBX, %ebx
+ psllq %mm6, %mm2 C wanted left shifted in all cases below
+
+ movd %mm5, %eax
+
+ movl SAVE_ESI, %esi
+ jz L(end_even)
+
+
+L(end_odd):
+
+ C Size odd, destination was aligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+-------+
+ C | written | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at (%edi), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+ movd disp(4) (%edx), %mm0
+ testb $32, %cl
+
+ movq %mm0, %mm1
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+ psllq %mm6, %mm1
+
+ por %mm2, %mm0
+
+ movq %mm0, disp(0) (%edi)
+ jz L(end_odd_unaligned)
+ movd %mm1, disp(-4) (%edi)
+L(end_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi+4
+ C --+---------------+-------+
+ C | written | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movq for the aligned case overwrites the movd for the
+ C unaligned case.
+
+ movq %mm2, %mm0
+ psrlq $32, %mm2
+
+ testb $32, %cl
+ movd %mm2, disp(4) (%edi)
+
+ jz L(end_even_unaligned)
+ movq %mm0, disp(0) (%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/mod_1.asm b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
new file mode 100644
index 0000000000..545ca56ddf
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
@@ -0,0 +1,457 @@
+dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
+dnl
+dnl K7: 17.0 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C The code here is the same as mpn_divrem_1, but with the quotient
+C discarded. See mpn/x86/k7/mmx/divrem_1.asm for some comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 41 c/l.
+dnl
+dnl Using mul or div is about the same speed at 3 limbs, so the threshold
+dnl is set to 4 to get the smaller div code used at 3.
+
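+dnl For sizes below the threshold the code simply chains one divl per
+dnl limb, high to low.  A C sketch of that (32-bit mp_limb_t assumed;
+dnl carry must be less than divisor on entry, which the code arranges;
+dnl mod_1_ref is a hypothetical name):
+dnl
+dnl    mp_limb_t
+dnl    mod_1_ref (const mp_limb_t *src, long size, mp_limb_t divisor,
+dnl               mp_limb_t carry)
+dnl    {
+dnl      long i;
+dnl      for (i = size-1; i >= 0; i--)
+dnl        {
+dnl          unsigned long long n
+dnl            = ((unsigned long long) carry << 32) | src[i];
+dnl          carry = (mp_limb_t) (n % divisor);    /* one divl each */
+dnl        }
+dnl      return carry;
+dnl    }
+dnl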
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp LF(mpn_mod_1,start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(32)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(divide_done)
+
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+
+C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
+C but it's already fast and compact, and there's nothing to gain by
+C expanding it out.
+C
+C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ movl %edx, %eax
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ movl %ebx, SAVE_EBX
+ leal -4(%esi), %ebx
+
+ movl %ebx, VAR_SRC_STOP
+ movl %edi, SAVE_EDI
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor((b*(b-d)-1)/d)
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs.
+C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
+C
+C There are four dummy instructions in the loop, all of which are necessary
+C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
+C or changed from load to store or vice versa. They're not completely
+C random, since they correspond to what mpn_divrem_1 has, but there's no
+C obvious reason why they're necessary. Presumably they induce something
+C good in the out of order execution, perhaps through some load/store
+C ordering and/or decoding effects.
+C
+C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
+C special data that comes out as q1==0xFFFFFFFF always, the loop runs at
+C about 13.5 c/l.
+
+ ALIGN(32)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SIZE, %ebx C dummy
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, PARAM_SIZE C dummy
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx		C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ nop C dummy
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ leal 0(%ecx), %ecx C dummy
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_SRC_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ cmpl %eax, %ecx
+ jne L(inverse_top)
+
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx		C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+
+C One limb left
+
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx		C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%eax), %edx
+ movl SAVE_EBP, %ebp
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ movl VAR_SRC_STOP, %edx
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(inverse_top)
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/popham.asm b/rts/gmp/mpn/x86/k7/mmx/popham.asm
new file mode 100644
index 0000000000..fa7c8c04a5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/popham.asm
@@ -0,0 +1,239 @@
+dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+dnl
+dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on
+dnl FreeBSD 3.3 and 3.4 doesn't recognise it.
+
+define(psadbw_mm4_mm0,
+`ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
+ `HAVE_TARGET_CPU_pentium3'),1,
+ `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0',
+
+`m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
+') C this works enough for the sum of bytes done below, making it
+ C possible to test on an older cpu
+ leal -8(%esp), %esp
+ movq %mm4, (%esp)
+ movq %mm0, %mm4
+forloop(i,1,7,
+` psrlq $ 8, %mm4
+ paddb %mm4, %mm0
+')
+ pushl $ 0
+ pushl $ 0xFF
+ pand (%esp), %mm0
+ movq 8(%esp), %mm4
+ leal 16(%esp), %esp
+')')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here is almost certainly not optimal, but is already a 3x speedup
+C over the generic C code. The main improvement would be to interleave
+C processing of two qwords in the loop so as to fully exploit the available
+C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
+C
+C The loop is based on the example "Efficient 64-bit population count using
+C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
+C page 158 of rev E (reference in mpn/x86/k7/README).
+
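+C For illustration, the per-qword reduction the loop below performs,
+C as a C sketch (unsigned long long assumed 64 bits; popcount_qword is
+C a hypothetical name; for hamdist the qword is src[i]^src2[i] first):
+C
+C    unsigned
+C    popcount_qword (unsigned long long x)
+C    {
+C      x -= (x & 0xAAAAAAAAAAAAAAAAULL) >> 1;            /* 2-bit sums */
+C      x = (x & 0x3333333333333333ULL)
+C          + ((x >> 2) & 0x3333333333333333ULL);         /* 4-bit sums */
+C      x = (x & 0x0F0F0F0F0F0F0F0FULL)
+C          + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);         /* byte sums */
+C      /* sum the eight bytes; the loop uses psadbw for this part */
+C      return (unsigned) ((x * 0x0101010101010101ULL) >> 56);
+C    }
+C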
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ DATA
+ ALIGN(8)
+
+define(LS,
+m4_assert_numargs(1)
+`LF(M4_function,`$1')')
+
+LS(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+LS(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+LS(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+')
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ orl %ecx, %ecx
+ jz L(zero)
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+
+',`
+ movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq LS(rodata_3333333333333333), %mm6
+ movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+ pxor %mm4, %mm4
+
+define(REG_AAAAAAAAAAAAAAAA,%mm7)
+define(REG_3333333333333333,%mm6)
+define(REG_0F0F0F0F0F0F0F0F,%mm5)
+define(REG_0000000000000000,%mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+ movd (%eax,%ecx,8), %mm1
+
+HAM(` movd 0(%edx,%ecx,8), %mm0
+ pxor %mm0, %mm1
+')
+ orl %ecx, %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+ decl %ecx
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+
+ psubd %mm1, %mm0 C bit pairs
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+
+ psadbw_mm4_mm0
+
+ paddd %mm0, %mm2 C add to total
+ jnz L(top)
+
+
+ movd %mm2, %eax
+ emms
+ ret
+
+
+L(zero):
+ movl $0, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/rshift.asm b/rts/gmp/mpn/x86/k7/mmx/rshift.asm
new file mode 100644
index 0000000000..abb546cd5b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -0,0 +1,471 @@
+dnl AMD K7 mpn_rshift -- mpn right shift.
+dnl
+dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. The bits shifted out at the right are
+C the return value.
+C
+C This code uses 64-bit MMX operations, which makes it possible to handle
+C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
+C code, on the other hand, suffers from shrd being a vector path decode and
+C running at 3 cycles back-to-back.
+C
+C Full speed depends on source and destination being aligned, and some hairy
+C setups and finish-ups are done to arrange this for the loop.
+
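+C As a reference sketch only, the operation in portable C (32-bit
+C mp_limb_t and 1 <= shift <= 31 assumed, as per the mpn conventions;
+C rshift_ref is a hypothetical name):
+C
+C    mp_limb_t
+C    rshift_ref (mp_limb_t *dst, const mp_limb_t *src, long size,
+C                unsigned shift)
+C    {
+C      mp_limb_t retval = src[0] << (32 - shift);
+C      long i;
+C      for (i = 0; i < size-1; i++)
+C        dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
+C      dst[size-1] = src[size-1] >> shift;
+C      return retval;
+C    }
+C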
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx C src limb
+
+ shrdl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shrl %cl, %edx
+
+ movl %edx, (%edi) C dst limb
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6 C rshift
+ movd (%edx), %mm5 C src low limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ leal (%edx,%eax,4), %edx C &src[size-1]
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+
+ movd (%edx), %mm4 C src high limb
+ negl %eax
+
+
+L(simple_top):
+ C eax loop counter, limbs, negative
+ C ebx
+ C ecx shift
+ C edx carry
+ C edx &src[size-1]
+ C edi &dst[size-2]
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src high limb
+ C mm5 src low limb
+ C mm6 shift
+
+ movq (%edx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq $32, %mm5
+ psrlq %mm6, %mm4
+
+ psrlq %mm6, %mm5
+ movd %mm4, 4(%edi) C dst high limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src low limb
+ C mm6 rshift
+
+ testb $4, %dl
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edi
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%edx), %mm0 C src low two limbs
+ addl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ addl $4, %edi
+ decl %eax C size-2 is new size-1
+
+ psrlq %mm6, %mm0
+ movl %edi, PARAM_DST C new dst
+
+ movd %mm0, -4(%edi)
+L(start_src_aligned):
+
+
+ movq (%edx), %mm1 C src low two limbs
+ decl %eax C size-2, two last limbs handled at end
+ testl $4, %edi
+
+ psrlq %mm6, %mm5
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
+ C
+ C source edx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edi
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ psrlq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ movd %mm1, (%edi)
+ movq %mm0, %mm1
+ addl $4, %edi C new dst
+
+ movd %ecx, %mm6
+L(start_dst_aligned):
+
+
+ movq %mm1, %mm2 C copy of src low two limbs
+ negl %ecx
+ andl $-2, %eax C round size down to even
+
+ movl %eax, %ebx
+ negl %eax
+ addl $64, %ecx
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C lshift = 64-rshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+ negl %eax
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ negl %eax
+
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(64)
+L(top):
+ C eax size, for use at end
+ C ebx loop counter
+ C ecx lshift
+ C edx src
+ C esi was computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating)
+ C mm2 /
+ C mm6 rshift
+ C mm7 lshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 8))
+
+ movq disp0(%edx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm2, %mm0
+ movq %mm0, disp0(%edi)
+
+
+ movq disp1(%edx), %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm1, %mm0
+ movq %mm0, disp1(%edi)
+')
+
+ addl $UNROLL_BYTES, %edx
+ addl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 8))
+
+ testb $1, %al
+	psrlq	%mm6, %mm2	C wanted right shifted in all cases below
+ movl SAVE_ESI, %esi
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EBX, %ebx
+ jz L(end_even)
+
+
+ C Size odd, destination was aligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +-------+---------------+---------------+--
+ C | | | written |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | written |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword to store, and in the aligned case there's
+ C a further extra limb of dst to be formed.
+
+
+ movd disp0(%edx), %mm0
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ testb $32, %cl
+
+ por %mm2, %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, disp0(%edi)
+ jz L(finish_odd_unaligned)
+
+ movd %mm1, disp1(%edi)
+L(finish_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is the same data as the movq for
+	C the aligned case; it's just a choice of whether one or two
+	C limbs should be written.
+
+
+ testb $32, %cl
+ movd %mm2, disp0(%edi)
+
+ jz L(end_even_unaligned)
+
+ movq %mm2, disp0(%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_1.asm b/rts/gmp/mpn/x86/k7/mul_1.asm
new file mode 100644
index 0000000000..07f7085b10
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mul_1.asm
@@ -0,0 +1,265 @@
+dnl AMD K7 mpn_mul_1 -- mpn by limb multiply.
+dnl
+dnl K7: 3.4 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 3.9
+dnl 16 3.4
+dnl 32 3.4
+dnl 64 3.35
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation; it's added into
+C the low limb of the destination.
+C
+C Variations on the unrolled loop have been tried, with the current
+C registers or with the counter on the stack to free up ecx. The current
+C code is the fastest found.
+C
+C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
+C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code
+C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
+C without having to change the computed jump. There's obviously something
+C fishy going on, perhaps with what execution units the mul needs.
+
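+C A C sketch of the operation, for illustration only (32-bit mp_limb_t
+C assumed; carry is 0 for mpn_mul_1 and the PARAM_CARRY value for
+C mpn_mul_1c; mul_1_ref is a hypothetical name):
+C
+C    mp_limb_t
+C    mul_1_ref (mp_limb_t *dst, const mp_limb_t *src, long size,
+C               mp_limb_t mult, mp_limb_t carry)
+C    {
+C      long i;
+C      for (i = 0; i < size; i++)
+C        {
+C          unsigned long long p
+C            = (unsigned long long) src[i] * mult + carry;
+C          dst[i] = (mp_limb_t) p;          /* low limb stored */
+C          carry = (mp_limb_t) (p >> 32);   /* high limb carried on */
+C        }
+C      return carry;
+C    }
+C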
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_EDI, -8)
+defframe(SAVE_ESI, -12)
+defframe(SAVE_EBX, -16)
+deflit(STACK_SPACE, 16)
+
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 7)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ jmp LF(mpn_mul_1,start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+ xorl %edx, %edx C initial carry
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME', STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ movl %ebp, SAVE_EBP
+ jae L(unroll)
+
+ leal (%esi,%ecx,4), %esi
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter (negative)
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(simple)
+
+ movl %ebx, %eax
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The mov to load the next source limb is done well ahead of the mul; this
+C is necessary for full speed. It leads to one limb handled separately
+C after the loop.
+C
+C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
+C to avoid having a 0x80 displacement in the code for the last limb in the
+C unrolled loop. This is for a fair comparison between 16 and 32 unrolling.
+
+ifelse(eval(UNROLL_COUNT >= 32),1,`
+deflit(SRC_OFFSET,4)
+',`
+deflit(SRC_OFFSET,)
+')
+
+ C this is offset 0x62, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+deflit(`FRAME', STACK_SPACE)
+
+ leal -1(%ecx), %edx C one limb handled at end
+ leal -2(%ecx), %ecx C and ecx is one less than edx
+ movl %ebp, SAVE_EBP
+
+ negl %edx
+ shrl $UNROLL_LOG2, %ecx C unrolled loop counter
+ movl (%esi), %eax C src low limb
+
+ andl $UNROLL_MASK, %edx
+ movl PARAM_DST, %edi
+
+ movl %edx, %ebp
+ shll $4, %edx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(add_eip_to_edx)
+L(here):
+',`
+ leal L(entry) (%edx,%ebp), %edx
+')
+ negl %ebp
+
+ leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi
+ movl PARAM_MULTIPLIER, %ebp
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(add_eip_to_edx):
+ C See README.family about old gas bugs
+ leal (%edx,%ebp), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C ----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax next src limb
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src+4
+ C edi dst
+ C ebp multiplier
+ C
+ C 17 code bytes per limb processed
+
+L(entry):
+forloop(i, 0, UNROLL_COUNT-1, `
+ deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0)))
+
+ mull %ebp
+
+ addl %eax, %ebx
+Zdisp( movl, disp_src,(%esi), %eax)
+Zdisp( movl, %ebx, disp_dst,(%edi))
+
+ movl $0, %ebx
+ adcl %edx, %ebx
+')
+
+ decl %ecx
+
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+
+ mull %ebp
+
+ addl %eax, %ebx
+ movl $0, %eax
+ movl SAVE_ESI, %esi
+
+ movl %ebx, disp0(%edi)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EDI, %edi
+
+ adcl %edx, %eax
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_basecase.asm b/rts/gmp/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 0000000000..c4be62e633
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,593 @@
+dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+dnl
+dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
+dnl limbs/loop unrolling).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
+dnl 8 4.67
+dnl 16 4.59
+dnl 32 4.42
+dnl Maximum possible with the current code is 32.
+dnl
+dnl At 32 the typical 13-26 limb sizes from the Karatsuba code will get
+dnl done with a straight run through a block of code, no inner loop. Using
+dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once. The saving is 15-25% on typical sizes coming from
+C the Karatsuba multiply code.
+
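+C The overall structure, as a C sketch for illustration (32-bit
+C mp_limb_t assumed; addmul_1_ref and mul_basecase_ref are
+C hypothetical names):
+C
+C    static mp_limb_t
+C    addmul_1_ref (mp_limb_t *dst, const mp_limb_t *src, long size,
+C                  mp_limb_t mult)
+C    {
+C      mp_limb_t c = 0;
+C      long i;
+C      for (i = 0; i < size; i++)
+C        {
+C          unsigned long long p
+C            = (unsigned long long) src[i] * mult + dst[i] + c;
+C          dst[i] = (mp_limb_t) p;
+C          c = (mp_limb_t) (p >> 32);
+C        }
+C      return c;
+C    }
+C
+C    void
+C    mul_basecase_ref (mp_limb_t *wp, const mp_limb_t *xp, long xsize,
+C                      const mp_limb_t *yp, long ysize)
+C    {
+C      mp_limb_t c = 0;
+C      long i, j;
+C      for (i = 0; i < xsize; i++)    /* mul_1 style pass for yp[0] */
+C        {
+C          unsigned long long p
+C            = (unsigned long long) xp[i] * yp[0] + c;
+C          wp[i] = (mp_limb_t) p;
+C          c = (mp_limb_t) (p >> 32);
+C        }
+C      wp[xsize] = c;
+C      for (j = 1; j < ysize; j++)    /* addmul pass per further yp limb */
+C        wp[j + xsize] = addmul_1_ref (wp + j, xp, xsize, yp[j]);
+C    }
+C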
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ mull (%edx)
+
+ movl PARAM_WP, %ecx
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+ decl PARAM_YSIZE
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+ movl %eax, %ecx C yp low limb
+
+ movl PARAM_WP, %ebx
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ movl %edx, %esi C xp
+
+ movl (%edx), %eax C xp low limb
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+
+ movl %esi, 4(%ebx)
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+
+dnl FRAME carries on from previous
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi defframe_pushl(`SAVE_EDI')
+ movl %edx, %edi C carry, for wp[1]
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+ movl 4(%ecx), %ecx C yp[1]
+ movl %edi, 4(%ebx)
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+ movl (%esi), %eax C xp[0]
+
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ movl %edi, 8(%ebx)
+
+ adcl $0, %esi
+ movl SAVE_EDI, %edi
+ movl %esi, 12(%ebx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+dnl FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+ subl $16, %esp
+deflit(`FRAME',16)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi xp end
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl PARAM_XSIZE, %ecx
+
+ movl %ebx, (%edi) C final carry
+ decl %edx
+
+ jnz L(ysize_more_than_one)
+
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EBP, %ebp
+ movl SAVE_ESI, %esi
+ addl $FRAME, %esp
+
+ ret
+
+
+L(ysize_more_than_one):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl (%esi,%ecx,4), %eax C xp low limb
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ incl %ecx
+
+ xorl %ebx, %ebx C initial carry
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ movl %ebp, PARAM_YP
+
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+ C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner)
+
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust back xp and wp, and rshifted
+C to give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ movl PARAM_XP, %esi
+ movl 4(%eax), %ebp C multiplier (yp second limb)
+ leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
+
+ movl PARAM_WP, %edi
+ movl %eax, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
+ decl %ecx C xsize-1
+
+ movl (%esi), %eax C xp low limb
+ andl $-UNROLL_MASK-1, %ebx
+ negl %ecx
+
+ subl $VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+ negl %ebx
+ andl $UNROLL_MASK, %ecx
+
+ movl %ebx, VAR_ADJUST
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ sarl $UNROLL_LOG2, %ebx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+ negl %edx
+
+ movl %eax, VAR_XP_LOW
+ movl %ecx, VAR_JMP
+ leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
+ leal 4(%esi,%edx,4), %esi C and start at second limb
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%ecx,%edx,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret
+')
+
+
+C --------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_outer_top):
+ C ebp ysize counter, negative
+
+ movl VAR_ADJUST, %ebx
+ movl PARAM_YP, %edx
+
+ movl VAR_XP_LOW, %eax
+ movl %ebp, PARAM_YSIZE C store incremented ysize counter
+
+ leal 4(%edi,%ebx,4), %edi
+ leal (%esi,%ebx,4), %esi
+ sarl $UNROLL_LOG2, %ebx
+
+ movl (%edx,%ebp,4), %ebp C yp next multiplier
+ movl VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ testb $1, %cl C and clear carry bit
+ movl %ebx, VAR_COUNTER
+ movl $0, %ebx
+
+ movl $0, %ecx
+ cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
+ cmovnz( %eax, %ebx)
+
+ C Extra fetch of VAR_JMP is bad, but registers are tight
+ jmp *VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax xp limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi xp+8
+ C edi wp
+ C ebp yp multiplier limb
+ C
+ C VAR_COUNTER loop counter, negative
+ C
+ C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ecx
+
+ mull %ebp
+
+ addl %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+')
+
+
+ incl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jnz L(unroll_top)
+
+
+ C eax
+ C ebx zero
+ C ecx low
+ C edx high
+ C esi
+	C edi	wp, pointing at second last limb
+ C ebp
+ C
+ C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ movl PARAM_YSIZE, %ebp
+ adcl $0, %edx
+ addl %ecx, disp0(%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, disp1(%edi)
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/sqr_basecase.asm b/rts/gmp/mpn/x86/k7/sqr_basecase.asm
new file mode 100644
index 0000000000..84861ea66b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -0,0 +1,627 @@
+dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
+dnl (measured on the speed difference between 25 and 50 limbs, which is
+dnl roughly the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
+dnl some comments.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes,
+C which is quite a bit, but is considered good value since squares big
+C enough to use most of the code will be spending quite a few cycles in it.
+
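+C As a rough guide to what the code below computes, here is an illustrative
+C C model (a hedged sketch, not GMP source; invented names, 32-bit limbs
+C assumed). The phases correspond to the code: the cross products (done
+C below as a mul_1 pass, addmul passes and a corner case), a doubling left
+C shift, then the squares added in on the diagonal.
+C
+C   typedef unsigned mp_limb_model_t;
+C
+C   void
+C   sqr_basecase_model (mp_limb_model_t *dst,
+C                       const mp_limb_model_t *src, int n)
+C   {
+C     unsigned long long  p, s;
+C     mp_limb_model_t     carry, next;
+C     int                 i, j;
+C
+C     for (i = 0; i < 2*n; i++)
+C       dst[i] = 0;
+C
+C     /* cross products src[i]*src[j], i<j, accumulated at dst[i+j] */
+C     for (i = 0; i < n-1; i++)
+C       {
+C         carry = 0;
+C         for (j = i+1; j < n; j++)
+C           {
+C             p = (unsigned long long) src[i] * src[j] + dst[i+j] + carry;
+C             dst[i+j] = (mp_limb_model_t) p;
+C             carry = (mp_limb_model_t) (p >> 32);
+C           }
+C         dst[i+n] = carry;
+C       }
+C
+C     /* double: dst[1..2n-2] <<= 1, high bit out becoming dst[2n-1] */
+C     carry = 0;
+C     for (i = 1; i < 2*n-1; i++)
+C       {
+C         next = dst[i] >> 31;
+C         dst[i] = (dst[i] << 1) | carry;
+C         carry = next;
+C       }
+C     dst[2*n-1] = carry;
+C
+C     /* add the squares src[i]^2 on the diagonal */
+C     carry = 0;
+C     for (i = 0; i < n; i++)
+C       {
+C         p = (unsigned long long) src[i] * src[i];
+C         s = (unsigned long long) dst[2*i] + (mp_limb_model_t) p + carry;
+C         dst[2*i] = (mp_limb_model_t) s;
+C         carry = (mp_limb_model_t) (s >> 32);
+C         s = (unsigned long long) dst[2*i+1] + (p >> 32) + carry;
+C         dst[2*i+1] = (mp_limb_model_t) s;
+C         carry = (mp_limb_model_t) (s >> 32);
+C       }
+C   }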
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ cmpl $2, %ecx
+
+ movl PARAM_DST, %edx
+ je L(two_limbs)
+ ja L(three_or_more)
+
+
+C------------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %edx, 4(%ecx)
+ movl %eax, (%ecx)
+ ret
+
+
+C------------------------------------------------------------------------------
+C
+C Using the read/modify/write "add"s seems to be faster than saving and
+C restoring registers. Perhaps the loads for the first set hide under the
+C mul latency and the second gets store-to-load forwarding.
+
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+deflit(`FRAME',0)
+
+ pushl %ebx FRAME_pushl()
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx) C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx) C dst[2]
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx) C dst[3]
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ popl %ebx
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C------------------------------------------------------------------------------
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+L(three_or_more):
+ subl $STACK_SPACE, %esp
+ cmpl $4, %ecx
+ jae L(four_or_more)
+deflit(`FRAME',STACK_SPACE)
+
+
+C------------------------------------------------------------------------------
+C Three limbs
+C
+C Writing out the loads and stores separately at the end of this code comes
+C out about 10 cycles faster than using adcls to memory.
+
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl %ebx, SAVE_EBX
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+ movl %edx, 12(%ecx)
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+ movl %edx, 20(%ecx)
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+ movl %edx, %edi
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %ebp, SAVE_EBP
+ movl $0, %ebp
+
+ movl 4(%ebx), %eax
+ adcl %edx, %ebp
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+ movl 4(%ecx), %eax
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+
+ adcl $0, %ebx
+ addl %eax, %esi
+ movl 8(%ecx), %eax
+
+ adcl %eax, %edi
+ movl 12(%ecx), %eax
+ movl %esi, 4(%ecx)
+
+ adcl %eax, %ebp
+ movl 16(%ecx), %eax
+ movl %edi, 8(%ecx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+
+ adcl %eax, %edx
+ movl 20(%ecx), %eax
+ movl %ebp, 12(%ecx)
+
+ adcl %ebx, %eax
+ ASSERT(nc)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 16(%ecx)
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C------------------------------------------------------------------------------
+L(four_or_more):
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C Further products are added in rather than stored.
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+defframe(`VAR_COUNTER',-20)
+defframe(`VAR_JMP', -24)
+deflit(EXTRA_STACK_SPACE, 8)
+
+ movl %ebx, SAVE_EBX
+ movl %edi, SAVE_EDI
+ leal (%edx,%ecx,4), %edi C &dst[size]
+
+ movl %esi, SAVE_ESI
+ movl %ebp, SAVE_EBP
+ leal (%eax,%ecx,4), %esi C &src[size]
+
+ movl (%eax), %ebp C multiplier
+ movl $0, %ebx
+ decl %ecx
+
+ negl %ecx
+ subl $EXTRA_STACK_SPACE, %esp
+FRAME_subl_esp(EXTRA_STACK_SPACE)
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul_1)
+
+
+C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
+C
+C The last two products, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as in mpn_addmul_1; see that routine for
+C some comments.
+C
+C VAR_COUNTER is the outer loop counter, running from -size+4 to -1,
+C inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K7 does branch prediction on indirect jumps, which is bad since it's a
+C different target each time. There seems no way to avoid this.
+
+dnl This value is also hard coded in some shifts and adds
+deflit(CODE_BYTES_PER_LIMB, 17)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31, but above that an offset must be added to them.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl Because the last chunk of code is generated differently, a label placed
+dnl at the end doesn't work. Instead calculate the implied end using the
+dnl start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ negl %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+	C The calculated jump mustn't come out before the start of the
+	C available code. This is the limit UNROLL_COUNT puts on the src
+	C operand size, but it's checked here directly using the jump address.
+ ASSERT(ae,
+ `movl_text_address(L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
+ movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
+
+ movl %edx, VAR_COUNTER
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+ movl %edx, %ebx C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+
+ leal CODE_BYTES_PER_LIMB(%edx), %eax
+ xorl %edx, %edx
+ leal 4(%edi), %edi
+
+ movl %eax, VAR_JMP
+
+ jmp *%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+	C Must be an even address to preserve the significance of the low
+	C bit of the jump address, which indicates which way around ecx/ebx
+	C should start.
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+',`
+ dnl this bit comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ecx
+
+ mull %ebp
+
+dnl Zdisp( addl %ebx, disp_src,(%edi))
+ addl %ebx, disp_dst(%edi)
+ifelse(forloop_last,0,
+` movl $0, %ebx')
+
+ adcl %eax, %ecx
+')
+')
+
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ adcl $0, %edx
+ addl %ecx, -4+OFFSET(%edi)
+ movl VAR_JMP, %ecx
+
+ adcl $0, %edx
+
+ movl %edx, m4_empty_if_zero(OFFSET) (%edi)
+ movl VAR_COUNTER, %edx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C------------------------------------------------------------------------------
+L(corner):
+ C esi &src[size]
+	C edi	&dst[2*size-4]
+
+ movl -12(%esi), %ebp
+ movl -8(%esi), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ movl -4(%esi), %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+ movl %eax, %esi
+
+ mull %ebp
+
+ addl %ebx, %eax
+
+ adcl $0, %edx
+ addl %eax, (%edi)
+ movl %esi, %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+
+ mull %ecx
+
+ addl %ebx, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+ movl %edx, 8(%edi)
+
+
+
+C Left shift of dst[1..2*size-2]; the high bit shifted out becomes
+C dst[2*size-1].
+
+L(lshift_start):
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edi
+ xorl %ecx, %ecx C clear carry
+
+ leal (%edi,%eax,8), %edi
+ notl %eax C -size-1, preserve carry
+
+ leal 2(%eax), %eax C -(size-1)
+
+L(lshift):
+ C eax counter, negative
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi dst, pointing just after last limb
+ C ebp
+
+ rcll -4(%edi,%eax,8)
+ rcll (%edi,%eax,8)
+ incl %eax
+ jnz L(lshift)
+
+ setc %al
+
+ movl PARAM_SRC, %esi
+ movl %eax, -4(%edi) C dst most significant limb
+
+ movl PARAM_SIZE, %ecx
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+
+ mull %eax
+
+	leal	(%esi,%ecx,4), %esi	C src points just after last limb
+ negl %ecx
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+ incl %ecx
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi src just after last limb
+ C edi dst just after last limb
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, -4(%edi,%ecx,8)
+ adcl %eax, (%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, -4(%edi) C dst most significant limb
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/lshift.asm b/rts/gmp/mpn/x86/lshift.asm
new file mode 100644
index 0000000000..4735335cbe
--- /dev/null
+++ b/rts/gmp/mpn/x86/lshift.asm
@@ -0,0 +1,90 @@
+dnl x86 mpn_lshift -- mpn left shift.
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
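+C The limbs are shifted towards more significant positions, and the bits
+C shifted out of the high limb are returned. A hedged C model (not GMP
+C source; invented names, 32-bit limbs and 1 <= shift <= 31 assumed, as
+C the shldl loop below expects):
+C
+C   unsigned
+C   lshift_model (unsigned *dst, const unsigned *src, int size,
+C                 unsigned shift)
+C   {
+C     unsigned  ret = src[size-1] >> (32 - shift);
+C     int       i;
+C     for (i = size-1; i > 0; i--)
+C       dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C     dst[0] = src[0] << shift;
+C     return ret;
+C   }
+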
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+deflit(`FRAME',12)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%edx
+ movl PARAM_SHIFT,%ecx
+
+ subl $4,%esi C adjust src
+
+ movl (%esi,%edx,4),%ebx C read most significant limb
+ xorl %eax,%eax
+ shldl( %cl, %ebx, %eax) C compute carry limb
+ decl %edx
+ jz L(end)
+ pushl %eax C push carry limb onto stack
+ testb $1,%dl
+ jnz L(1) C enter loop in the middle
+ movl %ebx,%eax
+
+ ALIGN(8)
+L(oop): movl (%esi,%edx,4),%ebx C load next lower limb
+ shldl( %cl, %ebx, %eax) C compute result limb
+ movl %eax,(%edi,%edx,4) C store it
+ decl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shldl( %cl, %eax, %ebx)
+ movl %ebx,(%edi,%edx,4)
+ decl %edx
+ jnz L(oop)
+
+ shll %cl,%eax C compute least significant limb
+ movl %eax,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+L(end): shll %cl,%ebx C compute least significant limb
+ movl %ebx,(%edi) C store it
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mod_1.asm b/rts/gmp/mpn/x86/mod_1.asm
new file mode 100644
index 0000000000..3908161b3e
--- /dev/null
+++ b/rts/gmp/mpn/x86/mod_1.asm
@@ -0,0 +1,141 @@
+dnl x86 mpn_mod_1 -- mpn by limb remainder.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl cycles/limb
+dnl K6 20
+dnl P5 44
+dnl P6 39
+dnl 486 approx 42 maybe
+dnl
+dnl The following have their own optimized mod_1 implementations, but for
+dnl reference the code here runs as follows.
+dnl
+dnl P6MMX 39
+dnl K7 41
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C Divide src,size by divisor and return the remainder. The quotient is
+C discarded.
+C
+C See mpn/x86/divrem_1.asm for some comments.
+
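+C In C terms the operation amounts to the following (a hedged sketch, not
+C GMP source; invented names, 32-bit limbs assumed). Each step divides the
+C running remainder and the next lower limb by the divisor, just as the
+C divl in the loop below does; mpn_mod_1c merely starts with r = carry.
+C
+C   unsigned
+C   mod_1_model (const unsigned *src, int size, unsigned divisor)
+C   {
+C     unsigned long long  n;
+C     unsigned            r = 0;
+C     int                 i;
+C     for (i = size-1; i >= 0; i--)
+C       {
+C         n = ((unsigned long long) r << 32) | src[i];
+C         r = (unsigned) (n % divisor);
+C       }
+C     return r;
+C   }
+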
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx, %ecx
+
+ movl PARAM_CARRY, %edx
+ jnz LF(mpn_mod_1,top)
+
+ popl %esi
+ movl %edx, %eax
+
+ popl %ebx
+
+ ret
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %esi FRAME_pushl()
+
+ orl %ecx, %ecx
+ jz L(done_zero)
+
+ movl PARAM_DIVISOR, %esi
+ movl -4(%ebx,%ecx,4), %eax C src high limb
+
+ cmpl %esi, %eax
+
+ sbbl %edx, %edx C -1 if high<divisor
+
+ addl %edx, %ecx C skip one division if high<divisor
+ jz L(done_eax)
+
+ andl %eax, %edx C carry if high<divisor
+
+
+L(top):
+ C eax scratch (quotient)
+ C ebx src
+ C ecx counter
+ C edx carry (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ movl -4(%ebx,%ecx,4), %eax
+
+ divl %esi
+
+ loop_or_decljnz L(top)
+
+
+ movl %edx, %eax
+L(done_eax):
+ popl %esi
+
+ popl %ebx
+
+ ret
+
+
+L(done_zero):
+ popl %esi
+ xorl %eax, %eax
+
+ popl %ebx
+
+ ret
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_1.asm b/rts/gmp/mpn/x86/mul_1.asm
new file mode 100644
index 0000000000..8817f291bc
--- /dev/null
+++ b/rts/gmp/mpn/x86/mul_1.asm
@@ -0,0 +1,130 @@
+dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
+dnl with a limb and store the result in a second limb vector.
+dnl
+dnl cycles/limb
+dnl P6: 5.5
+dnl
+dnl The following CPUs have their own optimized code, but for reference the
+dnl code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P5: 12.5
+dnl K6: 10.5
+dnl K7: 4.5
+
+
+dnl Copyright (C) 1992, 1994, 1997, 1998, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+
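+C The operation in C terms, as a hedged sketch (not GMP source; invented
+C names, 32-bit limbs assumed): each src limb is multiplied by the
+C multiplier, the low half of the product stored and the high half carried
+C into the next limb, with the final carry returned.
+C
+C   unsigned
+C   mul_1_model (unsigned *dst, const unsigned *src, int size,
+C                unsigned multiplier)
+C   {
+C     unsigned long long  p;
+C     unsigned            carry = 0;
+C     int                 i;
+C     for (i = 0; i < size; i++)
+C       {
+C         p = (unsigned long long) src[i] * multiplier + carry;
+C         dst[i] = (unsigned) p;
+C         carry = (unsigned) (p >> 32);
+C       }
+C     return carry;
+C   }
+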
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ movl %eax,(%edi)
+ movl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_SIZE,%ecx
+ shrl $2,%ecx
+ jz L(end)
+
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebp,4(%edi)
+ addl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,8(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl %ebp,12(%edi)
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oop)
+
+L(end): movl %ebx,%eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_basecase.asm b/rts/gmp/mpn/x86/mul_basecase.asm
new file mode 100644
index 0000000000..3a9b73895b
--- /dev/null
+++ b/rts/gmp/mpn/x86/mul_basecase.asm
@@ -0,0 +1,209 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
+dnl in a third limb vector.
+
+
+dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in haste, since the Pentium-optimized code that was used
+C for all x86 machines was slow for the Pentium II. This code would benefit
+C from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the L(outer) loop, one for each of the four possible values of xsize
+C mod 4. That would avoid L(oop0) altogether. Code expansion would be
+C > 4-fold for that part of the function, but since it is not very large,
+C that would be acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking. Its current speed is
+C unknown.
+
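+C In outline, wp = xp * yp is computed as one mul_1 pass for yp[0] followed
+C by an addmul_1 pass for each further limb of yp. A hedged C model of that
+C structure (illustration only, not GMP source; invented names and 32-bit
+C limbs assumed):
+C
+C   void
+C   mul_basecase_model (unsigned *wp, const unsigned *xp, int xsize,
+C                       const unsigned *yp, int ysize)
+C   {
+C     unsigned long long  p;
+C     unsigned            carry;
+C     int                 i, j;
+C
+C     carry = 0;                     /* mul_1: wp[0..xsize] = xp*yp[0] */
+C     for (j = 0; j < xsize; j++)
+C       {
+C         p = (unsigned long long) xp[j] * yp[0] + carry;
+C         wp[j] = (unsigned) p;
+C         carry = (unsigned) (p >> 32);
+C       }
+C     wp[xsize] = carry;
+C
+C     for (i = 1; i < ysize; i++)    /* addmul_1 for each yp[i] */
+C       {
+C         carry = 0;
+C         for (j = 0; j < xsize; j++)
+C           {
+C             p = (unsigned long long) xp[j] * yp[i] + wp[i+j] + carry;
+C             wp[i+j] = (unsigned) p;
+C             carry = (unsigned) (p >> 32);
+C           }
+C         wp[i+xsize] = carry;
+C       }
+C   }
+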
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER, -8)
+deflit(VAR_STACK_SPACE, 8)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ subl $VAR_STACK_SPACE,%esp
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+
+ leal 4(%esi),%esi
+ leal 4(%edi),%edi
+
+L(oopM):
+ movl (%esi),%eax C load next limb at xp[j]
+ leal 4(%esi),%esi
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %edx,%ebx
+ adcl $0,%ebx
+ movl %eax,(%edi)
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oopM)
+
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl PARAM_YSIZE,%eax C ysize
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ movl PARAM_YP,%ebp C yp
+ addl $4,%ebp C make ebp point to next v limb
+ movl %ebp,PARAM_YP
+ movl (%ebp),%eax C copy y limb ...
+ movl %eax,VAR_MULTIPLIER C ... to stack slot
+ movl PARAM_XSIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ addl %eax,(%edi)
+ adcl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_XSIZE,%ecx
+ shrl $2,%ecx
+ jz L(endX)
+
+ ALIGN(8)
+L(oopX):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebp,4(%edi)
+ adcl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,8(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ addl %ebp,12(%edi)
+ adcl $0,%ebx C propagate carry into cylimb
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oopX)
+
+L(endX):
+ movl %ebx,(%edi)
+ addl $4,%edi
+
+ C we incremented wp and xp in the loop above; compensate
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/README b/rts/gmp/mpn/x86/p6/README
new file mode 100644
index 0000000000..7dbc905a0d
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/README
@@ -0,0 +1,95 @@
+
+ INTEL P6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for Intel P6 class CPUs, meaning
+PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories
+have routines using MMX instructions.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+Some of these might be able to be improved.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.7
+
+ mpn_copyi 0.75
+ mpn_copyd 2.4
+
+ mpn_divrem_1 39.0
+ mpn_mod_1 39.0
+ mpn_divexact_by3 8.5
+
+ mpn_mul_1 5.5
+ mpn_addmul/submul_1 6.35
+
+ mpn_l/rshift 2.5
+
+ mpn_mul_basecase 8.2 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.0 cycles/crossproduct (approx)
+ or 7.75 cycles/triangleproduct (approx)
+
+Pentium II and III have MMX and get the following improvements.
+
+ mpn_divrem_1 25.0 integer part, 17.5 fractional part
+ mpn_mod_1 24.0
+
+ mpn_l/rshift 1.75
+
+
+
+
+NOTES
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
+to 26 cycles depending on how far speculative execution has gone. The 9
+cycle minimum penalty comes from the issue pipeline being 9 stages.
+
+A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
+5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3
+cycles per 16 byte block.
+
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three instructions with no successive
+dependencies, and with only the first being a multiple micro-op.
+
+P6 has out-of-order execution, so the groupings are really only showing
+dependent paths where some shuffling might allow some latencies to be
+hidden.
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
+02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/245127.htm
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is an older document mostly about P5 and not as good as the above.
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/rts/gmp/mpn/x86/p6/aorsmul_1.asm
new file mode 100644
index 0000000000..feb364ec0b
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/aorsmul_1.asm
@@ -0,0 +1,300 @@
+dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl P6 UNROLL_COUNT cycles/limb
+dnl 8 6.7
+dnl 16 6.35
+dnl 32 6.3
+dnl 64 6.3
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C This code is pretty much the same as the K6 code. The unrolled loop is
+C the same, but there are just a few scheduling tweaks in the setups and the
+C simple loop.
+C
+C A number of variations have been tried for the unrolled loop, with one or
+C two carries, and with loads scheduled earlier, but nothing faster than 6
+C cycles/limb has been found.
+
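+C For reference, the addmul operation in C terms, as a hedged sketch (not
+C GMP source; invented names, 32-bit limbs assumed). mpn_submul_1 is the
+C same shape but subtracts the product from dst and returns the borrow.
+C
+C   unsigned
+C   addmul_1_model (unsigned *dst, const unsigned *src, int size,
+C                   unsigned mult)
+C   {
+C     unsigned long long  p;
+C     unsigned            carry = 0;
+C     int                 i;
+C     for (i = 0; i < size; i++)
+C       {
+C         p = (unsigned long long) src[i] * mult + dst[i] + carry;
+C         dst[i] = (unsigned) p;
+C         carry = (unsigned) (p >> 32);
+C       }
+C     return carry;
+C   }
+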
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %ebx
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %ebx
+ jmp LF(M4_function_1,start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %ebx
+deflit(`FRAME',4)
+ xorl %ebx, %ebx C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %esi
+ pushl %edi
+deflit(`FRAME',12)
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+deflit(`FRAME',16)
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+ jae L(unroll)
+
+
+ C simple loop
+ C this is offset 0x22, so close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi), %eax
+ addl $4, %edi
+
+ mull %ebp
+
+ addl %ebx, %eax
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ decl %ecx
+
+ leal 4(%esi), %esi
+ jnz L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ movl %ebx, %eax
+
+ popl %ebx
+ ret
+
+
+
+C------------------------------------------------------------------------------
+C VAR_JUMP holds the computed jump temporarily because there aren't enough
+C registers when doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %ebx is necessary only for the
+C mpn_add/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+ C this is offset 0x43, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ shll $4, %edx
+ negl %ecx
+
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%esi), %eax C src low limb
+
+ movl %edx, VAR_JUMP
+ leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
+
+ mull %ebp
+
+ addl %ebx, %eax C initial carry (from _1c)
+ adcl $0, %edx
+
+ movl %edx, %ebx C high carry
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
+
+ movl VAR_JUMP, %edx
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ cmovnz( %ebx, %ecx) C high,low carry other way around
+ cmovnz( %eax, %ebx)
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ shll $4, %edx
+ negl %ecx
+
+ C See README.family about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+
+ addl (%esp), %edx
+
+ ret
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx carry hi
+ C ecx carry lo
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes per limb
+
+ addl $UNROLL_BYTES, %edi
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+
+ movl disp1(%esi), %eax
+ mull %ebp
+ M4_inst %ebx, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+
+ jns L(top)
+
+
+deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+
+ M4_inst %ecx, disp0(%edi)
+ movl %ebx, %eax
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+ adcl $0, %eax
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/diveby3.asm b/rts/gmp/mpn/x86/p6/diveby3.asm
new file mode 100644
index 0000000000..a77703ea89
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/diveby3.asm
@@ -0,0 +1,37 @@
+dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl P6: 8.5 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl The P5 code runs well on P6, in fact better than anything else found so
+dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the
+dnl dependent path are taking 4.5 cycles.
+dnl
+dnl The destination cache line prefetching is unnecessary on P6, but
+dnl removing it is a 2 cycle slowdown (approx), so it must be inducing
+dnl something good in the out of order execution.
+
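+dnl The method itself multiplies by the inverse of 3 mod 2^32, as in
+dnl mpn/generic/diveby3.c. A hedged C sketch of that method (invented
+dnl name, 32-bit limbs assumed; 0xAAAAAAAB*3 == 1 mod 2^32, and the
+dnl comparison constants are floor(0xFFFFFFFF/3) and twice that):
+dnl
+dnl   unsigned
+dnl   divexact_by3_model (unsigned *dst, const unsigned *src, int size)
+dnl   {
+dnl     unsigned  c = 0;             /* borrow */
+dnl     unsigned  l, s;
+dnl     int       i;
+dnl     for (i = 0; i < size; i++)
+dnl       {
+dnl         s = src[i];
+dnl         l = s - c;
+dnl         c = (l > s);
+dnl         l *= 0xAAAAAAAB;
+dnl         dst[i] = l;
+dnl         c += (l > 0x55555555) + (l > 0xAAAAAAAA);
+dnl       }
+dnl     return c;
+dnl   }
+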
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_divexact_by3c)
+include_mpn(`x86/pentium/diveby3.asm')
diff --git a/rts/gmp/mpn/x86/p6/gmp-mparam.h b/rts/gmp/mpn/x86/p6/gmp-mparam.h
new file mode 100644
index 0000000000..d7bfb6d60c
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/gmp-mparam.h
@@ -0,0 +1,96 @@
+/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 5 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 39 /* cycles */
+#endif
+
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 23
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 139
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 52
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 166
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 116
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 66
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 20
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 54
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 608
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 5888
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 672
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 5888
+#endif
diff --git a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
new file mode 100644
index 0000000000..f1b011b623
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
@@ -0,0 +1,677 @@
+dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
+dnl
+dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm;
+C see that file for some comments. It's likely what's here can be improved.
+
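+C In C terms the operation is as follows (a hedged sketch, not GMP source;
+C invented names, 32-bit limbs assumed). The integer part produces quotient
+C limbs from the src limbs; the fraction part continues with zero limbs, as
+C if the dividend had xsize fraction limbs appended.
+C
+C   unsigned
+C   divrem_1_model (unsigned *dst, int xsize,
+C                   const unsigned *src, int size, unsigned divisor)
+C   {
+C     unsigned long long  n;
+C     unsigned            r = 0;
+C     int                 i;
+C
+C     for (i = size-1; i >= 0; i--)      /* integer part */
+C       {
+C         n = ((unsigned long long) r << 32) | src[i];
+C         dst[xsize+i] = (unsigned) (n / divisor);
+C         r = (unsigned) (n % divisor);
+C       }
+C     for (i = xsize-1; i >= 0; i--)     /* fraction part */
+C       {
+C         n = (unsigned long long) r << 32;
+C         dst[i] = (unsigned) (n / divisor);
+C         r = (unsigned) (n % divisor);
+C       }
+C     return r;
+C   }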
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The different speeds of the integer and fraction parts mean that using
+dnl xsize+size isn't quite right. The threshold wants to be a bit higher
+dnl for the integer part and a bit lower for the fraction part. (Or what's
+dnl really wanted is to speed up the integer part!)
+dnl
+dnl The threshold is set to make the integer part right. At 4 limbs the
+dnl div and mul methods are about equal for the integer part, but for the
+dnl fraction part the mul is much faster.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi
+ jmp LF(mpn_divrem_1,start_1c)
+
+EPILOGUE()
+
+
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jz L(no_skip_div)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+ cmpl %ebp, %eax C one less div if high<divisor
+ jnb L(no_skip_div)
+
+ movl $0, (%edi,%ecx,4) C dst high limb
+ decl %ecx C size-1
+ movl %eax, %edx C src high limb as initial carry
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal 12(%edi), %ebx
+
+ movl %ebx, VAR_DST_STOP
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ecx, %ebx C size
+
+ bsrl %ebp, %ecx C 31-l
+ movl %edx, %edi C carry
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ movl $-1, %edx
+
+ shll %cl, %ebp C d normalized
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ movl %eax, VAR_INVERSE
+ orl %ebx, %ebx C size
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl %eax, VAR_SRC
+ jz L(start_zero)
+
+ movl 8(%eax), %esi C src high limb
+ cmpl $1, %ebx
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ shll %cl, %edi C n2 = carry << l
+ movl $0, %esi C n10 = 0
+
+	C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 we
+	C must have xsize!=0
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C This loop runs at about 25 cycles/limb, which is probably sub-optimal, and
+C certainly more than the dependent chain would suggest. A better loop, or
+C a better rough analysis of what's possible, would be welcomed.
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C n2+n1 1 (addl)
+C mul 5
+C q1+1 3 (addl/adcl)
+C mul 5
+C sub 3 (subl/sbbl)
+C addback 2 (cmov)
+C ---
+C 19
+C
+C Lack of registers hinders explicit scheduling and it might be that the
+C normal out of order execution isn't able to hide enough under the mul
+C latencies.
+C
+C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
+C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
+C combination was tried for the addback (despite the fact it would lengthen
+C the dependent chain) but found to be no faster.
+
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp d
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+ movq (%ecx), %mm0 C next src limb and the one below it
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ movl %ebp, %eax C d
+	leal	1(%edi), %ebx	C n2<<32 + m*(n2+n1)
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ movl VAR_DST, %ecx
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+ subl $4, %ecx
+
+ movl %ebx, (%ecx)
+ cmpl %eax, %ecx
+
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This makes the code a bit smaller and simpler, and
+C costs only 2 cycles (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl PARAM_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx	C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_DST_STOP, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx	C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl PARAM_XSIZE, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+ orl %eax, %eax C xsize
+ jnz L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF, meaning the low
+C dword of q*d is simply -d and the remainder n-q*d = n10+d.
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ movl %ecx, VAR_DST
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+
+ movl $-1, (%ecx)
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C mul 5
+C q1+1 1 (addl)
+C mul 5
+C sub 3 (negl/sbbl)
+C addback 2 (cmov)
+C ---
+C 16
+C
+C The loop in fact runs at about 17.5 cycles/limb. Using a sarl/andl/addl
+C for the addback was found to be a touch slower.
+
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx
+ movl %edi, %eax
+
+ subl $8, %ecx
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi n2
+ C ebp divisor
+
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+ movl %eax, %edi C remainder->n2
+ cmpl %esi, %ecx
+
+ movl %ebx, (%ecx) C previous q
+ jne L(fraction_top)
+
+
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
new file mode 100644
index 0000000000..e7d8d94d33
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
@@ -0,0 +1,444 @@
+dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
+dnl
+dnl P6MMX: 24.0 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C The code here is very similar to mpn_divrem_1, but with the quotient
+C discarded. What's here probably isn't optimal.
+C
+C See mpn/x86/p6/mmx/divrem_1.asm and mpn/x86/k7/mmx/mod_1.asm for some
+C comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp LF(mpn_mod_1,start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl $0, %edx C initial carry (if can't skip a div)
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(divide_done)
+
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ movl %ebx, SAVE_EBX
+ leal -4(%esi), %ebx
+
+ movl %ebx, VAR_SRC_STOP
+ movl %ecx, %ebx C size
+
+ movl %edi, SAVE_EDI
+ movl %edx, %edi C carry
+
+ bsrl %ebp, %ecx C 31-l
+ movl $-1, %edx
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ shll %cl, %ebp C d normalized
+
+ movd %eax, %mm7
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs. This
+C gets it down to 24 c/l, which is still a bit away from a theoretical 19
+C c/l.
+
+ ALIGN(16)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx	C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ movl VAR_SRC_STOP, %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+ cmpl %ebx, %ecx
+
+ jne L(inverse_top)
+
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx	C n2<<32 + m*(n2+n1)
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+ movl %ebp, %eax C d
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+
+C One limb left
+
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx	C n2<<32 + m*(n2+n1)
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+ movl %ebp, %eax C d
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%esi), %edx
+ movl SAVE_EBP, %ebp
+
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF, meaning the low
+C dword of q*d is simply -d and the remainder n-q*d = n10+d.
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl VAR_SRC_STOP, %edx
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+ cmpl %ecx, %edx
+ jne L(inverse_top)
+
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/popham.asm b/rts/gmp/mpn/x86/p6/mmx/popham.asm
new file mode 100644
index 0000000000..50f9a11218
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/popham.asm
@@ -0,0 +1,31 @@
+dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+dnl
+dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb
+dnl (approx)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
new file mode 100644
index 0000000000..e63fbf334b
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
@@ -0,0 +1,30 @@
+dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Haven't actually measured it, but the K7 code with the psadbw should be
+dnl good on P-III.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/rts/gmp/mpn/x86/p6/sqr_basecase.asm
new file mode 100644
index 0000000000..174c78406a
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/sqr_basecase.asm
@@ -0,0 +1,641 @@
+dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+dnl product (measured on the speed difference between 20 and 40 limbs,
+dnl which is the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl a description. The only difference here is that UNROLL_COUNT can go up
+dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67)
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will at least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
+
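+C At the mpn level the method is roughly the following C sketch (an
+C illustration in the style of the generic version, not this file's exact
+C sequence): form the triangle of cross products, double it with a one-bit
+C shift, then add the squares along the diagonal.
+C
+C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	for (i = 1; i < size-1; i++)
+C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-i-1, src[i]);
+C	mpn_lshift (dst+1, dst+1, 2*size-2, 1);
+C	/* then add src[i]^2 into dst[2*i],dst[2*i+1] for i = 0..size-1 */
+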
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src limb
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl %eax, %esi
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%esi), %eax
+
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %edi, SAVE_EDI
+ movl %eax, %edi C dst[2]
+ movl (%esi), %eax
+
+ movl %ebp, SAVE_EBP
+ movl %edx, %ebp C dst[3]
+
+ mull 4(%esi) C src[0]*src[1]
+
+ addl %eax, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %ebx, %eax
+ movl SAVE_EBX, %ebx
+
+ adcl %edi, %edx
+ movl SAVE_EDI, %edi
+
+ adcl $0, %ebp
+
+ movl %eax, 4(%ecx)
+
+ movl %ebp, 12(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 8(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+deflit(`FRAME',0)
+
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ cmpl $4, %edx
+
+ movl PARAM_SRC, %esi
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %ebp defframe_pushl(`SAVE_EBP')
+ pushl %edi defframe_pushl(`SAVE_EDI')
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%esi), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+ movl 8(%esi), %eax
+
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%esi), %eax
+
+ mull 4(%esi) C src[0] * src[1]
+
+ movl %eax, %ebx
+ movl %edx, %edi
+
+ movl (%esi), %eax
+
+ mull 8(%esi) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%esi), %eax
+
+ mull 8(%esi) C src[1] * src[2]
+
+ xorl %esi, %esi
+ addl %eax, %ebp
+
+ C eax
+ C ebx dst[1]
+ C ecx dst
+ C edx dst[4]
+ C esi zero, will be dst[5]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %ebx, %ebx
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %esi
+ addl %ebx, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %ebx
+
+ adcl %ebp, %ebx
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %ebx, 12(%ecx)
+ movl SAVE_EBX, %ebx
+
+ adcl %edx, %edi
+ movl 20(%ecx), %eax
+
+ movl %edi, 16(%ecx)
+ movl SAVE_EDI, %edi
+
+ adcl %esi, %eax C no carry out of this
+ movl SAVE_ESI, %esi
+
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(`STACK_SPACE',24)
+
+L(four_or_more):
+ C eax src low limb
+ C ebx
+ C ecx
+ C edx size
+ C esi src
+ C edi
+ C ebp
+deflit(`FRAME',4) dnl %esi already pushed
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+ subl $STACK_SPACE-FRAME, %esp
+deflit(`FRAME',STACK_SPACE)
+ movl $1, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ subl %edx, %ecx C -(size-1)
+
+ movl %ebp, SAVE_EBP
+ movl $0, %ebx C initial carry
+
+ leal (%esi,%edx,4), %esi C &src[size]
+ movl %eax, %ebp C multiplier
+
+ leal -4(%edi,%edx,4), %edi C &dst[size-1]
+
+
+C This loop runs at just over 6 c/l.
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, limbs, negative, -(size-1) to -1
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp multiplier
+
+ movl %ebp, %eax
+
+ mull (%esi,%ecx,4)
+
+ addl %ebx, %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ movl %eax, 4(%edi,%ecx,4)
+
+ incl %ecx
+ jnz L(mul_1)
+
+
+ movl %ebx, 4(%edi)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+
+dnl This is also hard-coded in the address calculation below.
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With &src[size] and &dst[size-1] pointers, the displacements in the
+dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
+dnl that an offset must be added to them.
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>32),1,
+eval((UNROLL_COUNT-32)*4),
+0))
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ negl %ecx
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limit that UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+
+ ASSERT(ae,
+ `movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, second highest limb of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+
+ movl %edx, %ebx C high carry
+ leal 4(%edi), %edi
+
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+ movl %edx, VAR_JMP
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/ebx should start.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src pointer
+ C edi dst pointer
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/ebx reversed on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ebx, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+')
+')
+L(unroll_inner_end):
+
+ addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
+
+ movl VAR_COUNTER, %edx
+ adcl $0, %ecx
+
+ movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[2*size-5]
+ C ebp
+
+ movl -12(%esi), %eax
+
+ mull -8(%esi)
+
+ addl %eax, (%edi)
+ movl -12(%esi), %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ addl %eax, %ebx
+ movl -8(%esi), %eax
+
+ adcl $0, %edx
+
+ addl %ebx, 4(%edi)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ movl PARAM_SIZE, %ecx
+ addl %ebx, %eax
+
+ adcl $0, %edx
+
+ movl %eax, 8(%edi)
+
+ movl %edx, 12(%edi)
+ movl PARAM_DST, %edi
+
+
+C Left shift of dst[1..2*size-2]; the bit shifted out becomes dst[2*size-1].
+
+ subl $1, %ecx C size-1
+ xorl %eax, %eax C ready for final adcl, and clear carry
+
+ movl %ecx, %edx
+ movl PARAM_SRC, %esi
+
+
+L(lshift):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx size-1 (for later use)
+ C esi src (for later use)
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+
+ leal 8(%edi), %edi
+ decl %ecx
+ jnz L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%esi), %eax C src[0]
+
+ leal 4(%esi,%edx,4), %esi C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't been set yet, and just gets the low limb
+C of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi &src[size]
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README
new file mode 100644
index 0000000000..3b9ec8ac6f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/README
@@ -0,0 +1,77 @@
+
+ INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors. The mmx subdirectory has code for Pentium with MMX (P55).
+
+
+STATUS
+
+ cycles/limb
+
+ mpn_add_n/sub_n 2.375
+
+ mpn_copyi/copyd 1.0
+
+ mpn_divrem_1 44.0
+ mpn_mod_1 44.0
+ mpn_divexact_by3 15.0
+
+ mpn_l/rshift 5.375 normal (6.0 on P54)
+ 1.875 special shift by 1 bit
+
+ mpn_mul_1 13.0
+ mpn_add/submul_1 14.0
+
+ mpn_mul_basecase 14.2 cycles/crossproduct (approx)
+
+ mpn_sqr_basecase 8 cycles/crossproduct (approx)
+ or 15.5 cycles/triangleproduct (approx)
+
+Pentium MMX gets the following improvements
+
+ mpn_l/rshift 1.75
+
+
+1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically. The P55 runs them at the expected speed.
+
+2. mpn_add_n and mpn_sub_n should run at 2 cycles/limb asymptotically. Due
+to loop overhead and other delays (cache refill?), they run at or near 2.5
+cycles/limb.
+
+3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
+should. Intel documentation says a mul instruction is 10 cycles, but it
+measures as 9, and the routines using it run accordingly.
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. Pentium doesn't allocate cache lines on writes, unlike most other modern
+processors. Since the functions in the mpn class do array writes, we have to
+handle allocating the destination cache lines by reading a word from them in
+the loops, to achieve the best performance (see the example below, after
+point 2).
+
+2. Pairing of memory operations requires that the two issued operations refer
+to different cache banks. The simplest way to ensure this is to read/write
+two words from the same object. If the operations are on different objects,
+they may or may not fall in the same cache bank.
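+
+For instance, the main loops in these routines contain an instruction like
+
+	movl	28(%edi),%eax	C fetch destination cache line
+
+whose only purpose is to bring the destination cache line into L1 before
+the stores to it.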
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is mostly about P5, the parts about P6 aren't relevant. Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000000..a61082a456
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,196 @@
+dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+dnl
+dnl P5: 2.375 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
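+C In C terms the operation is as follows (a sketch of the semantics only,
+C shown for addition; subtraction is the same with borrows):
+C
+C	cy = carry;			/* 0 for the plain _n entrypoints */
+C	for (i = 0; i < size; i++)
+C	  {
+C	    sum = src1[i] + cy;
+C	    cy = (sum < cy);
+C	    sum += src2[i];
+C	    cy += (sum < src2[i]);
+C	    dst[i] = sum;
+C	  }
+C	return cy;
+C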
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(endgo)
+
+ pushl %edx
+FRAME_pushl()
+ movl PARAM_CARRY,%eax
+ shrl $1,%eax C shift bit 0 into carry
+ jmp LF(M4_function_n,oop)
+
+L(endgo):
+deflit(`FRAME',16)
+ movl PARAM_CARRY,%eax
+ shrl $1,%eax C shift bit 0 into carry
+ jmp LF(M4_function_n,end)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(end)
+ pushl %edx
+FRAME_pushl()
+
+ ALIGN(8)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 8(%ebp),%ebx
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 12(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 16(%ebp),%ebx
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 20(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 24(%ebp),%ebx
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 28(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 32(%ebp),%ebx
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebp),%ebp
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+FRAME_popl()
+L(end):
+ decl %edx C test %edx w/o clobbering carry
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebp),%ebp
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000000..147b55610f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,99 @@
+dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 14.0 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
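+C In C terms, for addmul_1 (a sketch of the semantics only; umul_ppmm as
+C in longlong.h, and submul_1 subtracts instead of adding):
+C
+C	cy = 0;
+C	for (i = 0; i < size; i++)
+C	  {
+C	    umul_ppmm (hi, lo, src[i], mult);
+C	    lo += cy;      hi += (lo < cy);
+C	    dst[i] += lo;  hi += (dst[i] < lo);
+C	    cy = hi;
+C	  }
+C	return cy;
+C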
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(M4_function_1)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST, %edi
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ leal (%edi,%ecx,4), %edi
+ leal (%esi,%ecx,4), %esi
+ negl %ecx
+ xorl %ebx, %ebx
+ ALIGN(8)
+
+L(oop): adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ M4_inst %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(oop)
+
+ adcl $0, %ebx
+ movl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm
new file mode 100644
index 0000000000..dbac81642f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/diveby3.asm
@@ -0,0 +1,183 @@
+dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl P5: 15.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
+deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
+
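+
+dnl The method per limb, sketched in C (an illustration, not this file's
+dnl exact instruction sequence): subtract the incoming carry, multiply by
+dnl the inverse of 3 mod 2^32 to get the quotient limb, and count how many
+dnl thirds of 2^32 that quotient reaches to form the outgoing carry.
+dnl
+dnl	l = src[i] - c;
+dnl	c = (l > src[i]);		/* borrow from the subtract */
+dnl	q = l * INVERSE_3;		/* quotient limb, mod 2^32 */
+dnl	dst[i] = q;
+dnl	c += (q >= ONE_THIRD_CEIL);
+dnl	c += (q >= TWO_THIRDS_CEIL);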
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %ecx
+ movl PARAM_SIZE, %edx
+
+ decl %edx
+ jnz L(two_or_more)
+
+ movl (%ecx), %edx
+ movl PARAM_CARRY, %eax C risk of cache bank clash here
+
+ movl PARAM_DST, %ecx
+ subl %eax, %edx
+
+ sbbl %eax, %eax C 0 or -1
+
+ imull $INVERSE_3, %edx, %edx
+
+ negl %eax C 0 or 1
+ cmpl $ONE_THIRD_CEIL, %edx
+
+ sbbl $-1, %eax C +1 if edx>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %edx
+
+ sbbl $-1, %eax C +1 if edx>=ceil(b*2/3)
+ movl %edx, (%ecx)
+
+ ret
+
+
+L(two_or_more):
+ C eax
+ C ebx
+ C ecx src
+ C edx size-1
+ C esi
+ C edi
+ C ebp
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ pushl %edi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ movl PARAM_CARRY, %esi
+
+ movl (%ecx), %eax C src low limb
+ xorl %ebx, %ebx
+
+	subl	%esi, %eax
+ movl $TWO_THIRDS_FLOOR, %esi
+
+ leal (%ecx,%edx,4), %ecx C &src[size-1]
+ leal (%edi,%edx,4), %edi C &dst[size-1]
+
+ adcl $0, %ebx C carry, 0 or 1
+ negl %edx C -(size-1)
+
+
+C The loop needs a source limb ready at the top, which leads to one limb
+C handled separately at the end, and the special case above for size==1.
+C There doesn't seem to be any scheduling that would keep the speed but move
+C the source load and carry subtract up to the top.
+C
+C The destination cache line prefetching adds 1 cycle to the loop but is
+C considered worthwhile. The slowdown is a factor of 1.07, but will prevent
+C repeated write-throughs if the destination isn't in L1. A version using
+C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
+C no faster, due to unavoidable branch mispredictions in the inner loop.
+C
+C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx
+C could be avoided then the src limb fetch could pair up and save a cycle.
+C This would probably mean going to a two limb loop with the carry limb
+C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
+C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
+C
+C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
+C "cmpl %edx, $n" with the immediate as the second operand.
+C
+C The "4" source displacement is in the loop rather than the setup because
+C this gets L(top) aligned to 8 bytes at no cost.
+
+ ALIGN(8)
+L(top):
+ C eax source limb, carry subtracted
+ C ebx carry (0 or 1)
+ C ecx &src[size-1]
+ C edx counter, limbs, negative
+ C esi TWO_THIRDS_FLOOR
+ C edi &dst[size-1]
+ C ebp scratch (result limb)
+
+ imull $INVERSE_3, %eax, %ebp
+
+ cmpl $ONE_THIRD_CEIL, %ebp
+ movl (%edi,%edx,4), %eax C dst cache line prefetch
+
+ sbbl $-1, %ebx C +1 if ebp>=ceil(b/3)
+ cmpl %ebp, %esi
+
+ movl 4(%ecx,%edx,4), %eax C next src limb
+
+ sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3)
+ movl $0, %ebx
+
+ adcl $0, %ebx C new carry
+ movl %ebp, (%edi,%edx,4)
+
+ incl %edx
+ jnz L(top)
+
+
+
+ imull $INVERSE_3, %eax, %edx
+
+ cmpl $ONE_THIRD_CEIL, %edx
+ movl %edx, (%edi)
+
+ sbbl $-1, %ebx C +1 if edx>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %edx
+
+ sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3)
+ popl %ebp
+
+ movl %ebx, %eax
+ popl %edi
+
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000000..d3ed3d73ce
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 179
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 153
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 46
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 110
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 3840
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000000..e1e35d4c57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_lshift -- mpn left shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (%esi,%ebp,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ shldl( %cl, %ebx, %edx)
+ shldl( %cl, %eax, %ebx)
+ movl %edx,-8(%edi)
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ shldl( %cl, %edx, %eax)
+ shldl( %cl, %ebx, %edx)
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl( %cl,%eax,%edx)
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx C compute least significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from the least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ movl (%esi),%edx
+ addl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ addl %edx,%edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,(%edi)
+ adcl %edx,%edx
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ adcl %ebx,%ebx
+ movl %edx,8(%edi)
+ adcl %eax,%eax
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ adcl %edx,%edx
+ movl %eax,16(%edi)
+ adcl %ebx,%ebx
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,24(%edi)
+ adcl %edx,%edx
+ movl %eax,28(%edi)
+
+ leal 32(%esi),%esi C use leal not to clobber carry
+ leal 32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ adcl %edx,%edx
+ movl %ebx,(%edi)
+
+ leal 4(%esi),%esi C use leal not to clobber carry
+ leal 4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000000..2379077d0c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 99
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 89
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 40
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000000..2225438658
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,455 @@
+dnl Intel P5 mpn_lshift -- mpn left shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
+
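+C In C terms, for 1 <= shift <= 31 (a sketch of the semantics only):
+C
+C	retval = src[size-1] >> (32-shift);
+C	for (i = size-1; i > 0; i--)
+C	  dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift));
+C	dst[0] = src[0] << shift;
+C	return retval;
+C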
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ movl -4(%ebx,%eax,4), %edi C src high limb
+ decl %eax
+
+ jnz L(simple)
+
+ shldl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shll %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx,%eax,4), %mm5 C src high limb
+
+ movd %ecx, %mm6 C lshift
+ negl %ecx
+
+ psllq %mm6, %mm5
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+ psrlq $32, %mm5 C retval
+
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ C
+
+ movd %mm0, 4(%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+
+ movd %mm5, %eax
+ psllq %mm6, %mm0
+
+ popl %edi
+ popl %ebx
+
+ movd %mm0, (%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd -4(%ebx,%eax,4), %mm5 C src high limb
+ leal (%ebx,%eax,4), %edi
+
+ movd %ecx, %mm6 C lshift
+ andl $4, %edi
+
+ psllq %mm6, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb separately (marked xxx) to
+ C make it so.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ movq -8(%ebx,%eax,4), %mm0 C unaligned load
+
+ psllq %mm6, %mm0
+ decl %eax
+
+ psrlq $32, %mm0
+
+ C
+
+ movd %mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+ movq -8(%ebx,%eax,4), %mm1 C src high qword
+ leal (%edx,%eax,4), %edi
+
+ andl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ movq -16(%ebx,%eax,4), %mm3 C src second highest qword
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+ C is 32 bits extra. High limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psllq %mm6, %mm0
+
+ movd %ecx, %mm6
+ psrlq $32, %mm0
+
+ C wasted cycle here waiting for %mm0
+
+ movd %mm0, -4(%edx,%eax,4)
+ subl $4, %edx
+L(start_dst_aligned):
+
+
+ psllq %mm6, %mm1
+ negl %ecx C -shift
+
+ addl $64, %ecx C 64-shift
+ movq %mm3, %mm2
+
+ movd %ecx, %mm7
+ subl $8, %eax C size-8
+
+ psrlq %mm7, %mm3
+
+ por %mm1, %mm3 C mm3 ready to store
+ jc L(finish)
+
+
+ C The comments in mpn_rshift apply here too.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from 48(%ebx,%eax,4)
+ C mm3 dst qword ready to store to 56(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 lshift
+ C mm7 rshift
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq (%ebx,%eax,4), %mm3 C
+ psllq %mm6, %mm1 C
+
+ movq %mm0, 16(%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psrlq %mm7, %mm3 C
+ subl $4, %eax
+
+ por %mm1, %mm3 C
+ jnc L(unroll_loop)
+
+
+
+L(finish):
+ C eax -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %al
+
+ jz L(finish_no_two)
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ subl $2, %eax
+L(finish_no_two):
+
+
+ C eax -4 or -3 representing respectively 0 or 1 limbs remaining
+ C
+ C mm2 src prev qword, from 48(%ebx,%eax,4)
+ C mm3 dst qword, for 56(%edx,%eax,4)
+
+ testb $1, %al
+ movd %mm5, %eax C retval
+
+ popl %edi
+ jz L(finish_zero)
+
+
+ C One extra src limb, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4 edx
+ C --+---------------+---------------+-------+
+ C | mm3 | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra src limb, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 4(%edx), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+
+ movd (%ebx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm3, 12(%edx)
+ psllq $32, %mm0
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, 4(%edx)
+ psrlq $32, %mm1
+
+ andl $32, %ecx
+ popl %ebx
+
+ jz L(finish_one_unaligned)
+
+ movd %mm1, (%edx)
+L(finish_one_unaligned):
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra src limbs, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra src limbs, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx+4
+ C --+---------------+-------+
+ C | mm3 | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movd for the unaligned case writes the same data to 4(%edx)
+ C that the movq does for the aligned case.
+
+
+ movq %mm3, 8(%edx)
+ andl $32, %ecx
+
+ psllq %mm6, %mm2
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, (%edx)
+L(finish_zero_unaligned):
+
+ psrlq $32, %mm2
+ popl %ebx
+
+ movd %mm5, %eax C retval
+
+ movd %mm2, 4(%edx)
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
new file mode 100644
index 0000000000..587a07ab3d
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
@@ -0,0 +1,30 @@
+dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+dnl
+dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000000..7672630d57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,460 @@
+dnl Intel P5 mpn_rshift -- mpn right shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned. Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
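+C
+C In rough C terms the operation is (an illustration only, 32-bit limbs):
+C
+C	retval = src[0] << (32-shift);
+C	for (i = 0; i < size-1; i++)
+C	  dst[i] = (src[i] >> shift) | (src[i+1] << (32-shift));
+C	dst[size-1] = src[size-1] >> shift;
+C	return retval;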
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ decl %eax
+ movl (%ebx), %edi C src low limb
+
+ jnz L(simple)
+
+ shrdl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shrl %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ leal (%ebx,%eax,4), %ebx C &src[size-1]
+
+ movd %ecx, %mm6 C rshift
+ leal -4(%edx,%eax,4), %edx C &dst[size-2]
+
+ psllq $32, %mm5
+ negl %eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop. Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx &src[size-1]
+ C ecx return value
+ C edx &dst[size-2]
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+
+ movq (%ebx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+ psrlq %mm6, %mm5 C return value
+
+ psrlq %mm6, %mm0
+ popl %edi
+
+ movd %mm5, %eax
+ popl %ebx
+
+ movd %mm0, 4(%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ movl $4, %edi
+
+ movd %ecx, %mm6 C rshift
+ testl %edi, %ebx
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source ebx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edx
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%ebx), %mm0 C unaligned load
+
+ psrlq %mm6, %mm0
+ addl $4, %ebx
+
+ decl %eax
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_src_aligned):
+
+
+ movq (%ebx), %mm1
+ testl %edi, %edx
+
+ psrlq %mm6, %mm5 C retval
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source ebx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psrlq %mm6, %mm0
+
+ movd %ecx, %mm6
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_dst_aligned):
+
+
+ movq 8(%ebx), %mm3
+ negl %ecx
+
+ movq %mm3, %mm2 C mm2 src qword
+ addl $64, %ecx
+
+ movd %ecx, %mm7
+ psrlq %mm6, %mm1
+
+ leal -12(%ebx,%eax,4), %ebx
+ leal -20(%edx,%eax,4), %edx
+
+ psllq %mm7, %mm3
+ subl $7, %eax C size-7
+
+ por %mm1, %mm3 C mm3 ready to store
+ negl %eax C -(size-7)
+
+ jns L(finish)
+
+
+ C This loop is the important bit, the rest is just support. Careful
+ C instruction scheduling achieves the claimed 1.75 c/l. The
+ C relevant parts of the pairing rules are:
+ C
+ C - mmx loads and stores execute only in the U pipe
+ C - only one mmx shift in a pair
+ C - wait one cycle before storing an mmx register result
+ C - the usual address generation interlock
+ C
+ C Two qword calculations are slightly interleaved. The instructions
+ C marked "C" belong to the second qword, and the "C prev" one is for
+ C the second qword from the previous iteration.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs, negative
+ C ebx &src[size-12]
+ C ecx
+ C edx &dst[size-12]
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from -8(%ebx,%eax,4)
+ C mm3 dst qword ready to store to -8(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 rshift
+ C mm7 lshift
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq 8(%ebx,%eax,4), %mm3 C
+ psrlq %mm6, %mm1 C
+
+ movq %mm0, (%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psllq %mm7, %mm3 C
+ addl $4, %eax
+
+ por %mm1, %mm3 C
+ js L(unroll_loop)
+
+
+L(finish):
+ C eax 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %al
+
+ jnz L(finish_no_two)
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ addl $2, %eax
+L(finish_no_two):
+
+
+ C eax 2 or 3 representing respectively 1 or 0 limbs remaining
+ C
+ C mm2 src prev qword, from -8(%ebx,%eax,4)
+ C mm3 dst qword, for -8(%edx,%eax,4)
+
+ testb $1, %al
+ popl %edi
+
+ movd %mm5, %eax C retval
+ jnz L(finish_zero)
+
+
+ C One extra limb, destination was aligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +-------+---------------+---------------+--
+ C | | | mm3 |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra limb, destination was unaligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 8(%edx), and in the aligned case
+ C there's a further extra limb of dst to be formed.
+
+
+ movd 8(%ebx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, (%edx)
+ por %mm2, %mm0
+
+ psrlq %mm6, %mm1
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_one_unaligned)
+
+ C dst was aligned, must store one extra limb
+ movd %mm1, 16(%edx)
+L(finish_one_unaligned):
+
+ movq %mm0, 8(%edx)
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra limbs, destination was aligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra limbs, destination was unaligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+	C The movd for the unaligned case writes the same low limb as the
+	C movq for the aligned case; it's just a choice between writing one
+	C limb or two.
+
+
+ movq %mm3, 4(%edx)
+ psrlq %mm6, %mm2
+
+ movd %mm2, 12(%edx)
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000000..08639eca09
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,79 @@
+dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 13.0 cycles/limb
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
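+C
+C Calculate src,size multiplied by multiplier, store the result in
+C dst,size, and return the carry limb left over at the top.  In rough C
+C terms (an illustration only, prod being a two-limb temporary):
+C
+C	carry = 0;
+C	for (i = 0; i < size; i++) {
+C	  prod = src[i] * multiplier + carry;
+C	  dst[i] = low limb of prod;
+C	  carry = high limb of prod;
+C	}
+C	return carry;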
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST, %edi
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ leal (%edi,%ecx,4), %edi
+ leal (%esi,%ecx,4), %esi
+ negl %ecx
+ xorl %ebx, %ebx
+ ALIGN(8)
+
+L(oop): adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(oop)
+
+ adcl $0, %ebx
+ movl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000000..d9f79a0831
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,135 @@
+dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+dnl
+dnl P5: 14.2 cycles/crossproduct (approx)
+
+
+dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
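+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.  As usual for mpn_mul_basecase, xsize >= ysize >= 1 is
+C assumed.  In outline (an illustration only):
+C
+C	wp[0..xsize] = xp[0..xsize-1] * yp[0]         (mul_1 style row)
+C	wp[i..i+xsize] += xp[0..xsize-1] * yp[i]      (rows i=1..ysize-1)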
+
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_COUNTER, -4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+ pushl %eax C dummy push for allocating stack slot
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',16)
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ movl PARAM_XSIZE,%eax
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+ leal (%esi,%eax,4),%esi C make xp point at end
+ leal (%edi,%eax,4),%edi C offset wp by xsize
+ negl %ecx C negate j size/index for inner loop
+ xorl %eax,%eax C clear carry
+
+ ALIGN(8)
+L(oop1): adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax C load next limb at xp[j]
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %eax,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop1)
+
+ adcl $0,%ebx
+ movl PARAM_YSIZE,%eax
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ addl $4,%ebp C make ebp point to next y limb
+ movl PARAM_XSIZE,%ecx
+ negl %ecx
+ xorl %ebx,%ebx
+
+ C code at 0x61 here, close enough to aligned
+L(oop2):
+ adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax
+ mull (%ebp)
+ addl %ebx,%eax
+ movl (%edi,%ecx,4),%ebx
+ adcl $0,%edx
+ addl %eax,%ebx
+ movl %ebx,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop2)
+
+ adcl $0,%ebx
+
+ movl %ebx,(%edi)
+ addl $4,%edi
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $4,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ popl %eax C dummy pop for deallocating stack slot
+ ret
+
+EPILOGUE()
+
diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000000..e8f5ae8ec8
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_rshift -- mpn right shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 5 cycles lost somewhere in
+C each 8-limb unrolled loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) C jump if res_ptr + 1 >= s_ptr
+ leal (%edi,%ebp,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) C jump if s_ptr >= res_ptr + size
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ shrdl( %cl, %ebx, %edx)
+ shrdl( %cl, %eax, %ebx)
+ movl %edx,8(%edi)
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ shrdl( %cl, %edx, %eax)
+ shrdl( %cl, %ebx, %edx)
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl( %cl,%eax,%edx) C compute result limb
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx C compute most significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C This shift-by-1 code loops from the most significant end of the arrays,
+C which is only permissible when source and destination don't overlap in a
+C harmful way.  The checks above guarantee that, since the function is
+C documented to work for overlapping source and destination.
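+C
+C In rough C terms the rcr-based loop is (an illustration only):
+C
+C	carry = 0;
+C	for (i = size-1; i >= 0; i--) {
+C	  newcarry = src[i] & 1;
+C	  dst[i] = (src[i] >> 1) | (carry << 31);
+C	  carry = newcarry;
+C	}
+C	return carry << 31;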
+
+L(special):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ shrl %edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl %eax
+ movl %ebx,(%edi)
+ rcrl %edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ rcrl %ebx
+ movl %edx,-8(%edi)
+ rcrl %eax
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ rcrl %edx
+ movl %eax,-16(%edi)
+ rcrl %ebx
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl %eax
+ movl %ebx,-24(%edi)
+ rcrl %edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi C use leal not to clobber carry
+ leal -32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ rcrl %edx
+ movl %ebx,(%edi)
+
+ leal -4(%esi),%esi C use leal not to clobber carry
+ leal -4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ movl $0,%eax
+ rcrl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000000..c8584df13c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,520 @@
+dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+dnl product at around 20x20 limbs.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
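+C
+C In outline the method is (an illustration only; each step is commented
+C where it occurs below):
+C
+C	dst[1..size] = src[0] * src[1..size-1]              (mul1)
+C	dst[2n+1..n+size] += src[n] * src[n+1..size-1]      (rows n=1..size-2)
+C	dst[1..2*size-2] <<= 1, high bit to dst[2*size-1]   (lshift)
+C	dst[2i,2i+1] += src[i]^2, for i=0..size-1           (diagonal)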
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ pushl %esi
+ pushl %ebx
+
+ movl %eax, %ebx
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl %edx, %esi C dst[1]
+
+ movl 4(%ebx), %eax
+
+ mull %eax C src[1]^2
+
+ movl %eax, %edi C dst[2]
+ movl %edx, %ebp C dst[3]
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ addl %eax, %esi
+ popl %ebx
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %esi, %eax
+
+ adcl %edi, %edx
+ movl %eax, 4(%ecx)
+
+ adcl $0, %ebp
+ popl %esi
+
+ movl %edx, 8(%ecx)
+ movl %ebp, 12(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+
+ cmpl $4, %edx
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_SRC, %ebx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%ebx), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+
+ movl 8(%ebx), %eax
+ pushl %esi C risk of cache bank clash
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ movl (%ebx), %eax
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%ebx), %eax
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %ebx
+ addl %esi, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %esi
+
+ adcl %ebp, %esi
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl %esi, 12(%ecx)
+
+ adcl %edx, %edi
+ popl %esi
+
+ movl 20(%ecx), %eax
+ movl %edi, 16(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ adcl %ebx, %eax C no carry out of this
+ popl %ebx
+
+ movl %eax, 20(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(four_or_more):
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+ C esi
+ C edi
+ C ebp
+ C
+ C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+ pushl %edi
+FRAME_pushl()
+ pushl %esi
+FRAME_pushl()
+
+ pushl %ebp
+FRAME_pushl()
+ leal (%ecx,%edx,4), %edi C dst end of this mul1
+
+ leal (%ebx,%edx,4), %esi C src end
+ movl %ebx, %ebp C src
+
+ negl %edx C -size
+ xorl %ebx, %ebx C clear carry limb and carry flag
+
+ leal 1(%edx), %ecx C -(size-1)
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp src
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(mul1)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
+ C n=1..size-2.
+ C
+ C The last two products, which are the end corner of the product
+ C triangle, are handled separately to save looping overhead. These
+ C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+ C If size is 4 then it's only these that need to be done.
+ C
+ C In the outer loop %esi is a constant, and %edi just advances by 1
+ C limb each time. The size of the operation decreases by 1 limb
+ C each time.
+
+ C eax
+ C ebx carry (needing carry flag added)
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ adcl $0, %ebx
+ movl PARAM_SIZE, %edx
+
+ movl %ebx, (%edi)
+ subl $4, %edx
+
+ negl %edx
+ jz L(corner)
+
+
+L(outer):
+ C ebx previous carry limb to store
+ C edx outer loop counter (negative)
+ C esi &src[size]
+ C edi dst, pointing at stored carry limb of previous loop
+
+ pushl %edx C new outer loop counter
+ leal -2(%edx), %ecx
+
+ movl %ebx, (%edi)
+ addl $4, %edi
+
+ addl $4, %ebp
+ xorl %ebx, %ebx C initial carry limb, clear carry flag
+
+L(inner):
+ C eax scratch
+ C ebx carry (needing carry flag added)
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi dst end of this addmul
+ C ebp &src[j]
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(inner)
+
+
+ adcl $0, %ebx
+ popl %edx C outer loop counter
+
+ incl %edx
+ jnz L(outer)
+
+
+ movl %ebx, (%edi)
+
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-4]
+
+ movl -8(%esi), %eax
+ movl -4(%edi), %ebx C risk of data cache bank clash here
+
+ mull -12(%esi) C src[size-2]*src[size-3]
+
+ addl %eax, %ebx
+ movl %edx, %ecx
+
+ adcl $0, %ecx
+ movl -4(%esi), %eax
+
+ mull -12(%esi) C src[size-1]*src[size-3]
+
+ addl %ecx, %eax
+ movl (%edi), %ecx
+
+ adcl $0, %edx
+ movl %ebx, -4(%edi)
+
+ addl %eax, %ecx
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ movl -4(%esi), %eax
+
+ mull -8(%esi) C src[size-1]*src[size-2]
+
+ movl %ecx, 0(%edi)
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ movl PARAM_SIZE, %eax
+
+ negl %eax
+ movl %ebx, 4(%edi)
+
+ addl $1, %eax C -(size-1) and clear carry
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+ C eax counter, negative
+ C ebx next limb
+ C ecx
+ C edx
+ C esi
+ C edi &dst[2*size-4]
+ C ebp
+
+ movl 12(%edi,%eax,8), %ebx
+
+ rcll %ebx
+ movl 16(%edi,%eax,8), %ecx
+
+ rcll %ecx
+ movl %ebx, 12(%edi,%eax,8)
+
+ movl %ecx, 16(%edi,%eax,8)
+ incl %eax
+
+ jnz L(lshift)
+
+
+ adcl %eax, %eax C high bit out
+ movl PARAM_SRC, %esi
+
+ movl PARAM_SIZE, %ecx C risk of cache bank clash
+ movl %eax, 12(%edi) C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+ leal (%esi,%ecx,4), %esi C src end
+
+ negl %ecx
+
+ mull %eax
+
+ movl %eax, 16(%edi,%ecx,8) C dst[0]
+ movl %edx, %ebx
+
+ addl $1, %ecx C size-1 and clear carry
+
+L(diag):
+ C eax scratch (low product)
+ C ebx carry limb
+ C ecx counter, negative
+ C edx scratch (high product)
+ C esi &src[size]
+ C edi &dst[2*size-4]
+ C ebp scratch (fetched dst limbs)
+
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %ebx
+
+ mull %eax
+
+ movl 16-4(%edi,%ecx,8), %ebp
+
+ addl %ebp, %ebx
+ movl 16(%edi,%ecx,8), %ebp
+
+ adcl %eax, %ebp
+ movl %ebx, 16-4(%edi,%ecx,8)
+
+ movl %ebp, 16(%edi,%ecx,8)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(diag)
+
+
+ adcl $0, %edx
+ movl 16-4(%edi), %eax C dst most significant limb
+
+ addl %eax, %edx
+ popl %ebp
+
+ movl %edx, 16-4(%edi)
+ popl %esi C risk of cache bank clash
+
+ popl %edi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/rshift.asm b/rts/gmp/mpn/x86/rshift.asm
new file mode 100644
index 0000000000..c9881fd966
--- /dev/null
+++ b/rts/gmp/mpn/x86/rshift.asm
@@ -0,0 +1,92 @@
+dnl x86 mpn_rshift -- mpn right shift.
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+deflit(`FRAME',12)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%edx
+ movl PARAM_SHIFT,%ecx
+
+ leal -4(%edi,%edx,4),%edi
+ leal (%esi,%edx,4),%esi
+ negl %edx
+
+ movl (%esi,%edx,4),%ebx C read least significant limb
+ xorl %eax,%eax
+ shrdl( %cl, %ebx, %eax) C compute carry limb
+ incl %edx
+ jz L(end)
+ pushl %eax C push carry limb onto stack
+ testb $1,%dl
+ jnz L(1) C enter loop in the middle
+ movl %ebx,%eax
+
+ ALIGN(8)
+L(oop): movl (%esi,%edx,4),%ebx C load next higher limb
+ shrdl( %cl, %ebx, %eax) C compute result limb
+ movl %eax,(%edi,%edx,4) C store it
+ incl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shrdl( %cl, %eax, %ebx)
+ movl %ebx,(%edi,%edx,4)
+ incl %edx
+ jnz L(oop)
+
+ shrl %cl,%eax C compute most significant limb
+ movl %eax,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+L(end): shrl %cl,%ebx C compute most significant limb
+ movl %ebx,(%edi) C store it
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/udiv.asm b/rts/gmp/mpn/x86/udiv.asm
new file mode 100644
index 0000000000..9fe022b107
--- /dev/null
+++ b/rts/gmp/mpn/x86/udiv.asm
@@ -0,0 +1,44 @@
+dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
+C mp_limb_t divisor);
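+C
+C Divide the two-limb value {high,low} by divisor, store the remainder at
+C remptr, and return the quotient.  The usual x86 divl restriction applies:
+C high < divisor is assumed, otherwise the quotient overflows.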
+
+defframe(PARAM_DIVISOR, 16)
+defframe(PARAM_LOW, 12)
+defframe(PARAM_HIGH, 8)
+defframe(PARAM_REMPTR, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_udiv_qrnnd)
+deflit(`FRAME',0)
+ movl PARAM_LOW, %eax
+ movl PARAM_HIGH, %edx
+ divl PARAM_DIVISOR
+ movl PARAM_REMPTR, %ecx
+ movl %edx, (%ecx)
+ ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/umul.asm b/rts/gmp/mpn/x86/umul.asm
new file mode 100644
index 0000000000..3d289d1784
--- /dev/null
+++ b/rts/gmp/mpn/x86/umul.asm
@@ -0,0 +1,43 @@
+dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
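+C Multiply m1 by m2, store the low limb of the two-limb product at lowptr,
+C and return the high limb.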
+
+defframe(PARAM_M2, 12)
+defframe(PARAM_M1, 8)
+defframe(PARAM_LOWPTR, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_umul_ppmm)
+deflit(`FRAME',0)
+ movl PARAM_LOWPTR, %ecx
+ movl PARAM_M1, %eax
+ mull PARAM_M2
+ movl %eax, (%ecx)
+ movl %edx, %eax
+ ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/x86-defs.m4 b/rts/gmp/mpn/x86/x86-defs.m4
new file mode 100644
index 0000000000..2dad698002
--- /dev/null
+++ b/rts/gmp/mpn/x86/x86-defs.m4
@@ -0,0 +1,713 @@
+divert(-1)
+
+dnl m4 macros for x86 assembler.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Notes:
+dnl
+dnl m4 isn't perfect for processing BSD style x86 assembler code; the main
+dnl problems are:
+dnl
+dnl 1. Doing define(foo,123) and then using foo in an addressing mode like
+dnl foo(%ebx) expands as a macro rather than a constant. This is worked
+dnl around by using deflit() from asm-defs.m4, instead of define().
+dnl
+dnl 2. Immediates in macro definitions need a space or `' to stop the $
+dnl looking like a macro parameter. For example,
+dnl
+dnl define(foo, `mov $ 123, %eax')
+dnl
+dnl This is only a problem in macro definitions, not in ordinary text,
+dnl nor in macro parameters like text passed to forloop() or ifdef().
+
+
+deflit(BYTES_PER_MP_LIMB, 4)
+
+
+dnl --------------------------------------------------------------------------
+dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking.
+dnl Nesting and overlapping not allowed.
+dnl
+
+
+dnl Usage: PROLOGUE(functionname)
+dnl
+dnl Generate a function prologue. functionname gets GSYM_PREFIX added.
+dnl Examples,
+dnl
+dnl PROLOGUE(mpn_add_n)
+dnl PROLOGUE(somefun)
+
+define(`PROLOGUE',
+m4_assert_numargs(1)
+m4_assert_defined(`PROLOGUE_cpu')
+`ifdef(`PROLOGUE_current_function',
+`m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1)
+)')dnl
+m4_file_seen()dnl
+define(`PROLOGUE_current_function',`$1')dnl
+PROLOGUE_cpu(GSYM_PREFIX`'$1)')
+
+
+dnl Usage: EPILOGUE()
+dnl
+dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1
+dnl instead of the long PROLOGUE_current_function symbol.
+
+define(`EPILOGUE',
+m4_assert_numargs(0)
+m4_assert_defined(`EPILOGUE_cpu')
+`ifdef(`PROLOGUE_current_function',,
+`m4_error(`EPILOGUE'() with no `PROLOGUE'()
+)')dnl
+EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl
+undefine(`PROLOGUE_current_function')')
+
+m4wrap_prepend(
+`ifdef(`PROLOGUE_current_function',
+`m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen
+')')')
+
+
+dnl Usage: PROLOGUE_assert_inside()
+dnl
+dnl Use this unquoted on a line on its own at the start of a macro
+dnl definition to add some code to check the macro is only used inside a
+dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is
+dnl defined.
+
+define(PROLOGUE_assert_inside,
+m4_assert_numargs(0)
+``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '')
+
+define(PROLOGUE_assert_inside_internal,
+m4_assert_numargs(1)
+`ifdef(`PROLOGUE_current_function',,
+`m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair
+')')')
+
+
+dnl Usage: L(labelname)
+dnl LF(functionname,labelname)
+dnl
+dnl Generate a local label in the current or given function. For LF(),
+dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE().
+dnl
+dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn),
+dnl
+dnl L(bar) => L__gmpn_add_n__bar
+dnl LF(somefun,bar) => Lsomefun__bar
+dnl
+dnl The function name and label name get two underscores between them rather
+dnl than one to guard against clashing with a separate external symbol that
+dnl happened to be called functionname_labelname.  (Though this would only
+dnl happen if the local label prefix is empty.)  Underscores are used so
+dnl the whole label will still be a valid C identifier and so can be easily
+dnl used in gdb.
+
+dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the
+dnl L macro and making an infinite recursion.
+define(LF,
+m4_assert_numargs(2)
+m4_assert_defined(`LSYM_PREFIX')
+`defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2')
+
+define(`L',
+m4_assert_numargs(1)
+PROLOGUE_assert_inside()
+`LF(PROLOGUE_current_function,`$1')')
+
+
+dnl Called: PROLOGUE_cpu(gsym)
+dnl EPILOGUE_cpu(gsym)
+
+define(PROLOGUE_cpu,
+m4_assert_numargs(1)
+ `GLOBL $1
+ TYPE($1,`function')
+$1:')
+
+define(EPILOGUE_cpu,
+m4_assert_numargs(1)
+` SIZE($1,.-$1)')
+
+
+
+dnl --------------------------------------------------------------------------
+dnl Various x86 macros.
+dnl
+
+
+dnl Usage: ALIGN_OFFSET(bytes,offset)
+dnl
+dnl Align to `offset' away from a multiple of `bytes'.
+dnl
+dnl This is useful for testing, for example align to something very strict
+dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
+dnl
+dnl Generally you wouldn't execute across the padding, but it's done with
+dnl nops so it'll work.
+
+define(ALIGN_OFFSET,
+m4_assert_numargs(2)
+`ALIGN($1)
+forloop(`i',1,$2,` nop
+')')
+
+
+dnl Usage: defframe(name,offset)
+dnl
+dnl Make a definition like the following with which to access a parameter
+dnl or variable on the stack.
+dnl
+dnl define(name,`FRAME+offset(%esp)')
+dnl
+dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
+dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
+dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
+dnl zero offset is wanted.
+dnl
+dnl The new macro also gets a check that when it's used FRAME is actually
+dnl defined, and that the final %esp offset isn't negative, which would
+dnl mean an attempt to access something below the current %esp.
+dnl
+dnl deflit() is used rather than a plain define(), so the new macro won't
+dnl delete any following parenthesized expression. name(%edi) will come
+dnl out say as 16(%esp)(%edi). This isn't valid assembler and should
+dnl provoke an error, which is better than silently giving just 16(%esp).
+dnl
+dnl See README.family for more on the suggested way to access the stack
+dnl frame.
+
+define(defframe,
+m4_assert_numargs(2)
+`deflit(`$1',
+m4_assert_defined(`FRAME')
+`defframe_check_notbelow(`$1',$2,FRAME)dnl
+defframe_empty_if_zero(FRAME+($2))(%esp)')')
+
+dnl Called: defframe_empty_if_zero(expression)
+define(defframe_empty_if_zero,
+`ifelse(defframe_empty_if_zero_disabled,1,
+`eval($1)',
+`m4_empty_if_zero($1)')')
+
+dnl Called: defframe_check_notbelow(`name',offset,FRAME)
+define(defframe_check_notbelow,
+m4_assert_numargs(3)
+`ifelse(eval(($3)+($2)<0),1,
+`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
+')')')
+
+
+dnl Usage: FRAME_pushl()
+dnl FRAME_popl()
+dnl FRAME_addl_esp(n)
+dnl FRAME_subl_esp(n)
+dnl
+dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
+dnl %esp of n bytes.
+dnl
+dnl Using these macros is completely optional. Sometimes it makes more
+dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's
+dnl jumps and different sequences of FRAME values need to be used in
+dnl different places.
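+dnl
+dnl For example (an illustration only),
+dnl
+dnl	deflit(`FRAME',0)
+dnl	pushl	%ebx	FRAME_pushl()	C FRAME now 4
+dnl	...
+dnl	popl	%ebx	FRAME_popl()	C FRAME back to 0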
+
+define(FRAME_pushl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+4))')
+
+define(FRAME_popl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-4))')
+
+define(FRAME_addl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-($1)))')
+
+define(FRAME_subl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+($1)))')
+
+
+dnl Usage: defframe_pushl(name)
+dnl
+dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack
+dnl location just pushed. This should come after a pushl instruction.
+dnl Putting it on the same line works and avoids lengthening the code. For
+dnl example,
+dnl
+dnl pushl %eax defframe_pushl(VAR_COUNTER)
+dnl
+dnl Notice the defframe() is done with an unquoted -FRAME thus giving its
+dnl current value without tracking future changes.
+
+define(defframe_pushl,
+`FRAME_pushl()defframe(`$1',-FRAME)')
+
+
+dnl --------------------------------------------------------------------------
+dnl Assembler instruction macros.
+dnl
+
+
+dnl Usage: emms_or_femms
+dnl femms_available_p
+dnl
+dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
+dnl femms instruction is available. emms_or_femms expands to femms if
+dnl available, or emms if not.
+dnl
+dnl emms_or_femms is meant for use in the K6 directory where plain K6
+dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
+dnl supported together.
+dnl
+dnl On K7 femms is no longer faster and is just an alias for emms, so plain
+dnl emms may as well be used.
+
+define(femms_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+ `HAVE_TARGET_CPU_k62',
+ `HAVE_TARGET_CPU_k63',
+ `HAVE_TARGET_CPU_athlon')')
+
+define(emms_or_femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,`femms',`emms')')
+
+
+dnl Usage: femms
+dnl
+dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the
+dnl following is a replacement using .byte.
+dnl
+dnl If femms isn't available, an emms is generated instead, for convenience
+dnl when testing on a machine without femms.
+
+define(femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,
+`.byte 15,14 C AMD 3DNow femms',
+`emms`'dnl
+m4_warning(`warning, using emms in place of femms, use for testing only
+')')')
+
+
+dnl Usage: jadcl0(op)
+dnl
+dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact
+dnl replacement, since it doesn't set the flags like adcl does.
+dnl
+dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
+dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch
+dnl misprediction penalty is small, and the multiply algorithm used leads
+dnl to a carry bit on average only 1/4 of the time.
+dnl
+dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for
+dnl comparison. For example,
+dnl
+dnl define(`jadcl0_disabled',1)
+dnl
+dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
+dnl the same size as an adcl. This makes it possible to use the exact same
+dnl computed jump code when testing the relative speed of jnc/incl and adcl
+dnl with jadcl0_disabled.
+
+define(jadcl0,
+m4_assert_numargs(1)
+`ifelse(jadcl0_disabled,1,
+ `adcl $`'0, $1',
+ `jnc 1f
+ incl $1
+1:dnl')')
+
+
+dnl Usage: cmov_available_p
+dnl
+dnl Expand to 1 if cmov is available, 0 if not.
+
+define(cmov_available_p,
+`m4_ifdef_anyof_p(
+ `HAVE_TARGET_CPU_pentiumpro',
+ `HAVE_TARGET_CPU_pentium2',
+ `HAVE_TARGET_CPU_pentium3',
+ `HAVE_TARGET_CPU_athlon')')
+
+
+dnl Usage: x86_lookup(target, key,value, key,value, ...)
+dnl x86_lookup_p(target, key,value, key,value, ...)
+dnl
+dnl Look for `target' among the `key' parameters.
+dnl
+dnl x86_lookup expands to the corresponding `value', or generates an error
+dnl if `target' isn't found.
+dnl
+dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not.
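+dnl
+dnl For example, with the x86_opcode_reg32_list defined further down,
+dnl
+dnl	x86_lookup(`%ebx',x86_opcode_reg32_list)      => 3
+dnl	x86_lookup_p(`%xyzzy',x86_opcode_reg32_list)  => 0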
+
+define(x86_lookup,
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+define(x86_lookup_p,
+`ifelse(eval($#<3),1, `0',
+`ifelse(`$1',`$2', `1',
+`x86_lookup_p(`$1',shift(shift(shift($@))))')')')
+
+
+dnl Usage: x86_opcode_reg32(reg)
+dnl x86_opcode_reg32_p(reg)
+dnl
+dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given
+dnl 32-bit register, eg. `%ebp' turns into 5.
+dnl
+dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
+dnl if not.
+
+define(x86_opcode_reg32,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_p,
+m4_assert_onearg()
+`x86_lookup_p(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_list,
+``%eax',0,
+`%ecx',1,
+`%edx',2,
+`%ebx',3,
+`%esp',4,
+`%ebp',5,
+`%esi',6,
+`%edi',7')
+
+
+dnl Usage: x86_opcode_tttn(cond)
+dnl
+dnl Expand to the 4-bit "tttn" field value for the given x86 branch
+dnl condition (like `c', `ae', etc).
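+dnl
+dnl For example, x86_opcode_tttn(`z') expands to 4, as per the list below.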
+
+define(x86_opcode_tttn,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_tttn_list)')
+
+define(x86_opcode_tttn_list,
+``o', 0,
+`no', 1,
+`b', 2, `c', 2, `nae',2,
+`nb', 3, `nc', 3, `ae', 3,
+`e', 4, `z', 4,
+`ne', 5, `nz', 5,
+`be', 6, `na', 6,
+`nbe', 7, `a', 7,
+`s', 8,
+`ns', 9,
+`p', 10, `pe', 10, `npo',10,
+`np', 11, `npe',11, `po', 11,
+`l', 12, `nge',12,
+`nl', 13, `ge', 13,
+`le', 14, `ng', 14,
+`nle',15, `g', 15')
+
+
+dnl Usage: cmovCC(srcreg,dstreg)
+dnl
+dnl Generate a cmov instruction if the target supports cmov, or simulate it
+dnl with a conditional jump if not (the latter being meant only for
+dnl testing). For example,
+dnl
+dnl cmovz( %eax, %ebx)
+dnl
+dnl cmov instructions are generated using .byte sequences, since only
+dnl recent versions of gas know cmov.
+dnl
+dnl The source operand can only be a plain register. (m4 code implementing
+dnl full memory addressing modes exists, believe it or not, but isn't
+dnl currently needed and isn't included.)
+dnl
+dnl All the standard conditions are defined. Attempting to use one without
+dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
+dnl an error. This ensures the necessary .byte sequences aren't
+dnl accidentally missed.
+
+dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
+define(define_cmov_many,
+`ifelse(m4_length(`$1'),0,,
+`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
+
+dnl Called: define_cmov(cond,tttn)
+define(define_cmov,
+m4_assert_numargs(2)
+`define(`cmov$1',
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl
+m4_doublequote($`'1),m4_doublequote($`'2)))')
+
+define_cmov_many(x86_opcode_tttn_list)
+
+
+dnl Called: cmov_internal(name,cond,tttn,src,dst)
+define(cmov_internal,
+m4_assert_numargs(5)
+`ifelse(cmov_available_p,1,
+`cmov_bytes_tttn(`$1',`$3',`$4',`$5')',
+`m4_warning(`warning, simulating cmov with jump, use for testing only
+')cmov_simulate(`$2',`$4',`$5')')')
+
+dnl Called: cmov_simulate(cond,src,dst)
+dnl If this is going to be used with memory operands for the source it will
+dnl need to be changed to do a fetch even if the condition is false, so as
+dnl to trigger exceptions the same way a real cmov does.
+define(cmov_simulate,
+m4_assert_numargs(3)
+ `j$1 1f C cmov$1 $2, $3
+ jmp 2f
+1: movl $2, $3
+2:')
+
+dnl Called: cmov_bytes_tttn(name,tttn,src,dst)
+define(cmov_bytes_tttn,
+m4_assert_numargs(4)
+`.byte dnl
+15, dnl
+eval(64+$2), dnl
+eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
+ C `$1 $3, $4'')
+
+
+dnl Usage: loop_or_decljnz label
+dnl
+dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever
+dnl is better.  "loop" is better on K6 and probably on 386; on other chips
+dnl a separate decl/jnz is better.
+dnl
+dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where
+dnl this loop_or_decljnz variation is enough to let the code be shared by
+dnl all chips.
+
+define(loop_or_decljnz,
+`ifelse(loop_is_better_p,1,
+ `loop',
+ `decl %ecx
+ jnz')')
+
+define(loop_is_better_p,
+`m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6',
+ `HAVE_TARGET_CPU_k62',
+ `HAVE_TARGET_CPU_k63',
+ `HAVE_TARGET_CPU_i386')')
+
+
+dnl Usage: Zdisp(inst,op,op,op)
+dnl
+dnl Generate explicit .byte sequences if necessary to force a byte-sized
+dnl zero displacement on an instruction. For example,
+dnl
+dnl Zdisp( movl, 0,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl .byte 139,70,0 C movl 0(%esi), %eax
+dnl
+dnl If the displacement given isn't 0, then normal assembler code is
+dnl generated. For example,
+dnl
+dnl Zdisp( movl, 4,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl movl 4(%esi), %eax
+dnl
+dnl This means a single Zdisp() form can be used with an expression for the
+dnl displacement, and .byte will be used only if necessary. The
+dnl displacement argument is eval()ed.
+dnl
+dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
+dnl implemented with a table of instructions and encodings. A new entry is
+dnl needed for any different operation or registers.
+
+define(Zdisp,
+`define(`Zdisp_found',0)dnl
+Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl
+Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl
+Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl
+Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl
+Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
+Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl
+Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl
+Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl
+Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl
+Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
+ifelse(Zdisp_found,0,
+`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
+')')')
+
+define(Zdisp_match,
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',0)
+ && m4_stringequal_p(`$3',`$8')
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$7'),0,
+` .byte $5 C `$1 0$3, $4'',
+` $6 $7$8, $9')',
+
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',`$7')
+ && m4_stringequal_p(`$3',0)
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$8'),0,
+` .byte $5 C `$1 $2, 0$4'',
+` $6 $7, $8$9')')')')
+
+
+dnl Usage: shldl(count,src,dst)
+dnl shrdl(count,src,dst)
+dnl shldw(count,src,dst)
+dnl shrdw(count,src,dst)
+dnl
+dnl Generate a double-shift instruction, possibly omitting a %cl count
+dnl parameter if that's what the assembler requires, as indicated by
+dnl WANT_SHLDL_CL in config.m4. For example,
+dnl
+dnl shldl( %cl, %eax, %ebx)
+dnl
+dnl turns into either
+dnl
+dnl shldl %cl, %eax, %ebx
+dnl or
+dnl shldl %eax, %ebx
+dnl
+dnl Immediate counts are always passed through unchanged. For example,
+dnl
+dnl shrdl( $2, %esi, %edi)
+dnl becomes
+dnl shrdl $2, %esi, %edi
+dnl
+dnl
+dnl If you forget to use the macro form "shldl( ...)" and instead write
+dnl just a plain "shldl ...", an error results. This ensures the necessary
+dnl variant treatment of %cl isn't accidentally bypassed.
+
+define(define_shd_instruction,
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1' `$3', `$4'',
+``$1' `$2', `$3', `$4'')')
+
+
+dnl Usage: ASSERT(cond, instructions)
+dnl
+dnl If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl flags condition to then be satisfied. For example,
+dnl
+dnl ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl The instructions can be omitted to just assert a flags condition with
+dnl no extra calculation. For example,
+dnl
+dnl ASSERT(nc)
+dnl
+dnl When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl flags, but the instructions themselves must preserve any registers that
+dnl matter. FRAME is adjusted for the push and pop, so the instructions
+dnl given can use defframe() stack variables.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+`ifelse(WANT_ASSERT,1,
+ `C ASSERT
+ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')')
+ $2
+ j`$1' 1f
+ ud2 C assertion failed
+1:
+ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')')
+')')
+
+
+dnl Usage: movl_text_address(label,register)
+dnl
+dnl Get the address of a text segment label, using either a plain movl or a
+dnl position-independent calculation, as necessary. For example,
+dnl
+dnl	movl_text_address(L(foo),%eax)
+dnl
+dnl This macro is only meant for use in ASSERT()s or when testing, since
+dnl the PIC sequence it generates will want to be done with a ret balancing
+dnl the call on CPUs with return address branch prediction.
+dnl
+dnl The addl generated here has a backward reference to 1b, and so won't
+dnl suffer from the two forwards references bug in old gas (described in
+dnl mpn/x86/README.family).
+
+define(movl_text_address,
+`ifdef(`PIC',
+ `call 1f
+1: popl $2 C %eip
+ addl `$'$1-1b, $2',
+ `movl `$'$1, $2')')
+
+
+divert`'dnl
diff --git a/rts/gmp/mpn/z8000/add_n.s b/rts/gmp/mpn/z8000/add_n.s
new file mode 100644
index 0000000000..3a136107fe
--- /dev/null
+++ b/rts/gmp/mpn/z8000/add_n.s
@@ -0,0 +1,53 @@
+! Z8000 __gmpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
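+! For reference, a C sketch of the operation, modeled on the generic
+! mpn/generic/add_n.c (illustrative only, not assembled; limbs are
+! 16 bits on this CPU):
+!
+!	mp_limb_t cy = 0;
+!	do {
+!	    mp_limb_t a = *s1_ptr++, b = *s2_ptr++;
+!	    mp_limb_t sum = a + b + cy;
+!	    cy = (sum < a) || (cy && sum == a);	/* carry out */
+!	    *res_ptr++ = sum;
+!	} while (--size != 0);
+!	return cy;
+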
+ unseg
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+	pop	r0,@r6		! pop doubles as load + post-increment of s1_ptr
+	pop	r1,@r5		! likewise for s2_ptr
+ add r0,r1
+ ld @r7,r0
+ dec r4
+ jr eq,Lend
+Loop: pop r0,@r6
+ pop r1,@r5
+ adc r0,r1
+ inc r7,#2
+ ld @r7,r0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000/gmp-mparam.h b/rts/gmp/mpn/z8000/gmp-mparam.h
new file mode 100644
index 0000000000..4216df673c
--- /dev/null
+++ b/rts/gmp/mpn/z8000/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 16
+#define BYTES_PER_MP_LIMB 2
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 16
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/z8000/mul_1.s b/rts/gmp/mpn/z8000/mul_1.s
new file mode 100644
index 0000000000..20fadd340a
--- /dev/null
+++ b/rts/gmp/mpn/z8000/mul_1.s
@@ -0,0 +1,68 @@
+! Z8000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! size r5
+! s2_limb r4
+
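+! For reference, a C sketch of the operation (cf. mpn/generic/mul_1.c;
+! illustrative only).  The code below must also correct the Z8000's
+! signed 16x16->32 mult up to an unsigned product, which is what the
+! sign_comp1/sign_comp2 additions are for:
+!
+!	mp_limb_t cy_limb = 0;
+!	do {
+!	    unsigned long prod = (unsigned long) *s1_ptr++ * s2_limb + cy_limb;
+!	    *res_ptr++ = prod & 0xffff;	/* low half to result */
+!	    cy_limb = prod >> 16;	/* high half becomes carry */
+!	} while (--size != 0);
+!	return cy_limb;
+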
+ unseg
+ .text
+ even
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+ sub r2,r2 ! zero carry limb
+ and r4,r4
+ jr mi,Lneg
+
+Lpos: pop r1,@r6
+ ld r9,r1
+ mult rr8,r4
+ and r1,r1 ! shift msb of loaded limb into cy
+ jr mi,Lp ! branch if loaded limb's msb is set
+ add r8,r4 ! hi_limb += sign_comp2
+Lp: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2
+ adc r2,r8
+ ld @r7,r9
+ inc r7,#2
+ dec r5
+ jr ne,Lpos
+ ret t
+
+Lneg: pop r1,@r6
+ ld r9,r1
+ mult rr8,r4
+ add r8,r1 ! hi_limb += sign_comp1
+ and r1,r1
+ jr mi,Ln
+ add r8,r4 ! hi_limb += sign_comp2
+Ln: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2
+ adc r2,r8
+ ld @r7,r9
+ inc r7,#2
+ dec r5
+ jr ne,Lneg
+ ret t
diff --git a/rts/gmp/mpn/z8000/sub_n.s b/rts/gmp/mpn/z8000/sub_n.s
new file mode 100644
index 0000000000..bd9a7ad409
--- /dev/null
+++ b/rts/gmp/mpn/z8000/sub_n.s
@@ -0,0 +1,54 @@
+! Z8000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
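+! For reference: this mirrors the C sketch in add_n.s, with borrow in
+! place of carry (illustrative only):
+!
+!	mp_limb_t cy = 0;
+!	do {
+!	    mp_limb_t a = *s1_ptr++, b = *s2_ptr++;
+!	    *res_ptr++ = a - b - cy;
+!	    cy = (a < b) || (cy && a == b);	/* borrow out */
+!	} while (--size != 0);
+!	return cy;
+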
+ unseg
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ pop r0,@r6
+ pop r1,@r5
+ sub r0,r1
+ ld @r7,r0
+ dec r4
+ jr eq,Lend
+Loop: pop r0,@r6
+ pop r1,@r5
+ sbc r0,r1
+ inc r7,#2
+ ld @r7,r0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000x/add_n.s b/rts/gmp/mpn/z8000x/add_n.s
new file mode 100644
index 0000000000..7f130785c5
--- /dev/null
+++ b/rts/gmp/mpn/z8000x/add_n.s
@@ -0,0 +1,56 @@
+! Z8000 (32 bit limb version) __gmpn_add_n -- Add two limb vectors of equal,
+! non-zero length.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
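+! Limbs are 32 bits here, held in register pairs (rr0 = r0:r1, etc), so
+! each limb is added as two 16-bit adc's: low words first, then high
+! words, with the carry from one limb's high words chaining into the
+! next limb's low words.
+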
+ segm
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+ popl rr0,@r6
+ popl rr8,@r5
+ addl rr0,rr8
+ ldl @r7,rr0
+ dec r4
+ jr eq,Lend
+Loop: popl rr0,@r6
+ popl rr8,@r5
+	adc	r1,r9		! low words, plus carry from previous limb
+	adc	r0,r8		! high words, plus carry from low words
+ inc r7,#4
+ ldl @r7,rr0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000x/sub_n.s b/rts/gmp/mpn/z8000x/sub_n.s
new file mode 100644
index 0000000000..f416d1d6eb
--- /dev/null
+++ b/rts/gmp/mpn/z8000x/sub_n.s
@@ -0,0 +1,56 @@
+! Z8000 (32 bit limb version) __gmpn_sub_n -- Subtract two limb vectors of the
+! same length > 0 and store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
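+! As in z8000x/add_n.s, each 32-bit limb is handled as two 16-bit sbc's,
+! with the borrow chaining from limb to limb.
+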
+ segm
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ popl rr0,@r6
+ popl rr8,@r5
+ subl rr0,rr8
+ ldl @r7,rr0
+ dec r4
+ jr eq,Lend
+Loop: popl rr0,@r6
+ popl rr8,@r5
+	sbc	r1,r9		! low words, minus borrow from previous limb
+	sbc	r0,r8		! high words, minus borrow from low words
+ inc r7,#4
+ ldl @r7,rr0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4
+ adc r2,r2
+ ret t