summaryrefslogtreecommitdiff
path: root/gmp/mpn
diff options
context:
space:
mode:
authorPedro Alvarez <pedro.alvarez@codethink.co.uk>2016-05-27 17:39:31 +0100
committerPedro Alvarez <pedro.alvarez@codethink.co.uk>2016-05-27 17:53:32 +0100
commit26c75cf8267919f81a1759c9c965a52c660233f9 (patch)
treecf2a39cf56c2c8ac45760854413ab233e6263974 /gmp/mpn
parent56892c1d217baea02092b51a09bbc924130ca84c (diff)
downloadgcc-tarball-baserock/pedroalvarez/gcc-5.3.0-gmp432.tar.gz
Diffstat (limited to 'gmp/mpn')
-rw-r--r--gmp/mpn/Makeasm.am36
-rw-r--r--gmp/mpn/Makefile.am73
-rw-r--r--gmp/mpn/Makefile.in597
-rw-r--r--gmp/mpn/README25
-rw-r--r--gmp/mpn/a29k/add_n.s118
-rw-r--r--gmp/mpn/a29k/addmul_1.s111
-rw-r--r--gmp/mpn/a29k/lshift.s91
-rw-r--r--gmp/mpn/a29k/mul_1.s95
-rw-r--r--gmp/mpn/a29k/rshift.s87
-rw-r--r--gmp/mpn/a29k/sub_n.s118
-rw-r--r--gmp/mpn/a29k/submul_1.s114
-rw-r--r--gmp/mpn/a29k/udiv.s28
-rw-r--r--gmp/mpn/a29k/umul.s27
-rw-r--r--gmp/mpn/alpha/README34
-rw-r--r--gmp/mpn/alpha/add_n.asm239
-rw-r--r--gmp/mpn/alpha/addmul_1.asm31
-rw-r--r--gmp/mpn/alpha/alpha-defs.m433
-rw-r--r--gmp/mpn/alpha/aorslsh1_n.asm226
-rw-r--r--gmp/mpn/alpha/aorslsh2_n.asm167
-rw-r--r--gmp/mpn/alpha/bdiv_dbm1c.asm31
-rw-r--r--gmp/mpn/alpha/cntlz.asm31
-rw-r--r--gmp/mpn/alpha/copyd.asm31
-rw-r--r--gmp/mpn/alpha/copyi.asm31
-rw-r--r--gmp/mpn/alpha/default.m449
-rw-r--r--gmp/mpn/alpha/dive_1.c27
-rw-r--r--gmp/mpn/alpha/diveby3.asm (renamed from gmp/mpn/alpha/ev5/diveby3.asm)40
-rw-r--r--gmp/mpn/alpha/divrem_2.asm38
-rw-r--r--gmp/mpn/alpha/ev5/add_n.asm146
-rw-r--r--gmp/mpn/alpha/ev5/com_n.asm (renamed from gmp/mpn/alpha/com.asm)39
-rw-r--r--gmp/mpn/alpha/ev5/gmp-mparam.h242
-rw-r--r--gmp/mpn/alpha/ev5/lshift.asm171
-rw-r--r--gmp/mpn/alpha/ev5/rshift.asm169
-rw-r--r--gmp/mpn/alpha/ev5/sub_n.asm146
-rw-r--r--gmp/mpn/alpha/ev6/add_n.asm31
-rw-r--r--gmp/mpn/alpha/ev6/aorslsh1_n.asm172
-rw-r--r--gmp/mpn/alpha/ev6/aorsmul_1.asm33
-rw-r--r--gmp/mpn/alpha/ev6/gmp-mparam.h257
-rw-r--r--gmp/mpn/alpha/ev6/mod_1_4.asm337
-rw-r--r--gmp/mpn/alpha/ev6/mul_1.asm35
-rw-r--r--gmp/mpn/alpha/ev6/nails/README27
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_1.asm35
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_2.asm33
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_3.asm33
-rw-r--r--gmp/mpn/alpha/ev6/nails/addmul_4.asm33
-rw-r--r--gmp/mpn/alpha/ev6/nails/aors_n.asm33
-rw-r--r--gmp/mpn/alpha/ev6/nails/gmp-mparam.h38
-rw-r--r--gmp/mpn/alpha/ev6/nails/mul_1.asm37
-rw-r--r--gmp/mpn/alpha/ev6/nails/submul_1.asm37
-rw-r--r--[-rwxr-xr-x]gmp/mpn/alpha/ev6/slot.pl39
-rw-r--r--gmp/mpn/alpha/ev6/sqr_diagonal.asm115
-rw-r--r--gmp/mpn/alpha/ev6/sub_n.asm31
-rw-r--r--gmp/mpn/alpha/ev67/gcd_1.asm31
-rw-r--r--gmp/mpn/alpha/ev67/hamdist.asm31
-rw-r--r--gmp/mpn/alpha/ev67/popcount.asm31
-rw-r--r--gmp/mpn/alpha/gmp-mparam.h45
-rw-r--r--gmp/mpn/alpha/invert_limb.asm399
-rw-r--r--gmp/mpn/alpha/lshift.asm193
-rw-r--r--gmp/mpn/alpha/mod_34lsub1.asm31
-rw-r--r--gmp/mpn/alpha/mode1o.asm33
-rw-r--r--gmp/mpn/alpha/mul_1.asm31
-rw-r--r--gmp/mpn/alpha/rshift.asm193
-rw-r--r--gmp/mpn/alpha/sec_tabselect.asm137
-rw-r--r--gmp/mpn/alpha/sqr_diag_addlsh1.asm93
-rw-r--r--gmp/mpn/alpha/sqr_diagonal.asm65
-rw-r--r--gmp/mpn/alpha/sub_n.asm243
-rw-r--r--gmp/mpn/alpha/submul_1.asm31
-rw-r--r--gmp/mpn/alpha/umul.asm31
-rw-r--r--gmp/mpn/alpha/unicos.m438
-rw-r--r--gmp/mpn/arm/README35
-rw-r--r--gmp/mpn/arm/add_n.asm69
-rw-r--r--gmp/mpn/arm/addmul_1.asm107
-rw-r--r--gmp/mpn/arm/aors_n.asm112
-rw-r--r--gmp/mpn/arm/aorslsh1_n.asm167
-rw-r--r--gmp/mpn/arm/aorsmul_1.asm135
-rw-r--r--gmp/mpn/arm/arm-defs.m467
-rw-r--r--gmp/mpn/arm/bdiv_dbm1c.asm113
-rw-r--r--gmp/mpn/arm/cnd_aors_n.asm134
-rw-r--r--gmp/mpn/arm/com.asm75
-rw-r--r--gmp/mpn/arm/copyd.asm86
-rw-r--r--gmp/mpn/arm/copyi.asm82
-rw-r--r--gmp/mpn/arm/dive_1.asm151
-rw-r--r--gmp/mpn/arm/gmp-mparam.h184
-rw-r--r--gmp/mpn/arm/invert_limb.asm164
-rw-r--r--gmp/mpn/arm/logops_n.asm139
-rw-r--r--gmp/mpn/arm/lshift.asm88
-rw-r--r--gmp/mpn/arm/lshiftc.asm95
-rw-r--r--gmp/mpn/arm/mod_34lsub1.asm121
-rw-r--r--gmp/mpn/arm/mode1o.asm92
-rw-r--r--gmp/mpn/arm/mul_1.asm58
-rw-r--r--gmp/mpn/arm/neon/README2
-rw-r--r--gmp/mpn/arm/neon/hamdist.asm194
-rw-r--r--gmp/mpn/arm/neon/lorrshift.asm279
-rw-r--r--gmp/mpn/arm/neon/lshiftc.asm257
-rw-r--r--gmp/mpn/arm/neon/popcount.asm166
-rw-r--r--gmp/mpn/arm/neon/sec_tabselect.asm140
-rw-r--r--gmp/mpn/arm/rsh1aors_n.asm124
-rw-r--r--gmp/mpn/arm/rshift.asm86
-rw-r--r--gmp/mpn/arm/sec_tabselect.asm131
-rw-r--r--gmp/mpn/arm/sub_n.asm71
-rw-r--r--gmp/mpn/arm/submul_1.asm107
-rw-r--r--gmp/mpn/arm/udiv.asm39
-rw-r--r--gmp/mpn/arm/v5/gcd_1.asm120
-rw-r--r--gmp/mpn/arm/v5/mod_1_1.asm129
-rw-r--r--gmp/mpn/arm/v5/mod_1_2.asm156
-rw-r--r--gmp/mpn/arm/v6/addmul_1.asm111
-rw-r--r--gmp/mpn/arm/v6/addmul_2.asm138
-rw-r--r--gmp/mpn/arm/v6/addmul_3.asm187
-rw-r--r--gmp/mpn/arm/v6/dive_1.asm149
-rw-r--r--gmp/mpn/arm/v6/gmp-mparam.h157
-rw-r--r--gmp/mpn/arm/v6/mode1o.asm95
-rw-r--r--gmp/mpn/arm/v6/mul_1.asm114
-rw-r--r--gmp/mpn/arm/v6/mul_2.asm131
-rw-r--r--gmp/mpn/arm/v6/popham.asm138
-rw-r--r--gmp/mpn/arm/v6/sqr_basecase.asm518
-rw-r--r--gmp/mpn/arm/v6/submul_1.asm125
-rw-r--r--gmp/mpn/arm/v6t2/divrem_1.asm212
-rw-r--r--gmp/mpn/arm/v6t2/gcd_1.asm115
-rw-r--r--gmp/mpn/arm/v7a/cora15/addmul_1.asm145
-rw-r--r--gmp/mpn/arm/v7a/cora15/aors_n.asm162
-rw-r--r--gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm158
-rw-r--r--gmp/mpn/arm/v7a/cora15/com.asm180
-rw-r--r--gmp/mpn/arm/v7a/cora15/gmp-mparam.h197
-rw-r--r--gmp/mpn/arm/v7a/cora15/logops_n.asm253
-rw-r--r--gmp/mpn/arm/v7a/cora15/mul_1.asm104
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm43
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm43
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm144
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/com.asm97
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/copyd.asm110
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/copyi.asm90
-rw-r--r--gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm177
-rw-r--r--gmp/mpn/arm/v7a/cora15/submul_1.asm159
-rw-r--r--gmp/mpn/arm/v7a/cora9/gmp-mparam.h209
-rw-r--r--gmp/mpn/arm64/aors_n.asm98
-rw-r--r--gmp/mpn/arm64/aorsmul_1.asm122
-rw-r--r--gmp/mpn/arm64/cnd_aors_n.asm99
-rw-r--r--gmp/mpn/arm64/copyd.asm93
-rw-r--r--gmp/mpn/arm64/copyi.asm77
-rw-r--r--gmp/mpn/arm64/gcd_1.asm125
-rw-r--r--gmp/mpn/arm64/invert_limb.asm83
-rw-r--r--gmp/mpn/arm64/logops_n.asm106
-rw-r--r--gmp/mpn/arm64/mul_1.asm98
-rw-r--r--gmp/mpn/asm-defs.m4188
-rw-r--r--gmp/mpn/clipper/add_n.s46
-rw-r--r--gmp/mpn/clipper/mul_1.s45
-rw-r--r--gmp/mpn/clipper/sub_n.s46
-rwxr-xr-xgmp/mpn/cpp-ccas33
-rw-r--r--gmp/mpn/cray/README31
-rw-r--r--gmp/mpn/cray/add_n.c25
-rw-r--r--gmp/mpn/cray/cfp/addmul_1.c25
-rw-r--r--gmp/mpn/cray/cfp/mul_1.c25
-rw-r--r--gmp/mpn/cray/cfp/mulwwc90.s45
-rw-r--r--gmp/mpn/cray/cfp/mulwwj90.s45
-rw-r--r--gmp/mpn/cray/cfp/submul_1.c25
-rw-r--r--gmp/mpn/cray/gmp-mparam.h38
-rw-r--r--gmp/mpn/cray/hamdist.c25
-rw-r--r--gmp/mpn/cray/ieee/addmul_1.c27
-rw-r--r--gmp/mpn/cray/ieee/gmp-mparam.h39
-rw-r--r--gmp/mpn/cray/ieee/invert_limb.c31
-rw-r--r--gmp/mpn/cray/ieee/mul_1.c25
-rw-r--r--gmp/mpn/cray/ieee/mul_basecase.c25
-rw-r--r--gmp/mpn/cray/ieee/sqr_basecase.c25
-rw-r--r--gmp/mpn/cray/ieee/submul_1.c27
-rw-r--r--gmp/mpn/cray/lshift.c27
-rw-r--r--gmp/mpn/cray/mulww.f55
-rw-r--r--gmp/mpn/cray/popcount.c25
-rw-r--r--gmp/mpn/cray/rshift.c27
-rw-r--r--gmp/mpn/cray/sub_n.c25
-rw-r--r--gmp/mpn/generic/add.c25
-rw-r--r--gmp/mpn/generic/add_1.c25
-rw-r--r--gmp/mpn/generic/add_err1_n.c101
-rw-r--r--gmp/mpn/generic/add_err2_n.c117
-rw-r--r--gmp/mpn/generic/add_err3_n.c132
-rw-r--r--gmp/mpn/generic/add_n.c35
-rw-r--r--gmp/mpn/generic/addmul_1.c28
-rw-r--r--gmp/mpn/generic/addsub_n.c (renamed from gmp/mpn/generic/add_n_sub_n.c)51
-rw-r--r--gmp/mpn/generic/bdiv_dbm1c.c25
-rw-r--r--gmp/mpn/generic/bdiv_q.c77
-rw-r--r--gmp/mpn/generic/bdiv_q_1.c126
-rw-r--r--gmp/mpn/generic/bdiv_qr.c85
-rw-r--r--gmp/mpn/generic/bdivmod.c124
-rw-r--r--gmp/mpn/generic/binvert.c91
-rw-r--r--gmp/mpn/generic/broot.c196
-rw-r--r--gmp/mpn/generic/brootinv.c140
-rw-r--r--gmp/mpn/generic/bsqrt.c48
-rw-r--r--gmp/mpn/generic/bsqrtinv.c105
-rw-r--r--gmp/mpn/generic/cmp.c25
-rw-r--r--gmp/mpn/generic/cnd_add_n.c70
-rw-r--r--gmp/mpn/generic/cnd_sub_n.c70
-rw-r--r--gmp/mpn/generic/com.c45
-rw-r--r--gmp/mpn/generic/comb_tables.c48
-rw-r--r--gmp/mpn/generic/copyd.c41
-rw-r--r--gmp/mpn/generic/copyi.c43
-rw-r--r--gmp/mpn/generic/dc_bdiv_q.c137
-rw-r--r--gmp/mpn/generic/dc_bdiv_qr.c (renamed from gmp/mpn/generic/dcpi1_bdiv_qr.c)70
-rw-r--r--gmp/mpn/generic/dc_div_q.c57
-rw-r--r--gmp/mpn/generic/dc_div_qr.c203
-rw-r--r--gmp/mpn/generic/dc_divappr_q.c196
-rw-r--r--gmp/mpn/generic/dc_divrem_n.c121
-rw-r--r--gmp/mpn/generic/dcpi1_bdiv_q.c160
-rw-r--r--gmp/mpn/generic/dcpi1_div_q.c87
-rw-r--r--gmp/mpn/generic/dcpi1_div_qr.c249
-rw-r--r--gmp/mpn/generic/dcpi1_divappr_q.c257
-rw-r--r--gmp/mpn/generic/div_q.c323
-rw-r--r--gmp/mpn/generic/div_qr_1.c126
-rw-r--r--gmp/mpn/generic/div_qr_1n_pi1.c277
-rw-r--r--gmp/mpn/generic/div_qr_1n_pi2.c195
-rw-r--r--gmp/mpn/generic/div_qr_1u_pi2.c228
-rw-r--r--gmp/mpn/generic/div_qr_2.c332
-rw-r--r--gmp/mpn/generic/div_qr_2n_pi1.c85
-rw-r--r--gmp/mpn/generic/div_qr_2u_pi1.c77
-rw-r--r--gmp/mpn/generic/dive_1.c60
-rw-r--r--gmp/mpn/generic/diveby3.c27
-rw-r--r--gmp/mpn/generic/divexact.c138
-rw-r--r--gmp/mpn/generic/divis.c175
-rw-r--r--gmp/mpn/generic/divrem.c43
-rw-r--r--gmp/mpn/generic/divrem_1.c60
-rw-r--r--gmp/mpn/generic/divrem_2.c186
-rw-r--r--gmp/mpn/generic/dump.c27
-rw-r--r--gmp/mpn/generic/fib2_ui.c108
-rw-r--r--gmp/mpn/generic/gcd.c312
-rw-r--r--gmp/mpn/generic/gcd_1.c100
-rw-r--r--gmp/mpn/generic/gcd_lehmer.c160
-rw-r--r--gmp/mpn/generic/gcd_subdiv_step.c209
-rw-r--r--gmp/mpn/generic/gcdext.c168
-rw-r--r--gmp/mpn/generic/gcdext_1.c265
-rw-r--r--gmp/mpn/generic/gcdext_lehmer.c168
-rw-r--r--gmp/mpn/generic/gcdext_subdiv_step.c197
-rw-r--r--gmp/mpn/generic/get_d.c564
-rw-r--r--gmp/mpn/generic/get_str.c47
-rw-r--r--gmp/mpn/generic/gmp-mparam.h27
-rw-r--r--gmp/mpn/generic/hgcd.c632
-rw-r--r--gmp/mpn/generic/hgcd2.c62
-rw-r--r--gmp/mpn/generic/hgcd2_jacobi.c366
-rw-r--r--gmp/mpn/generic/hgcd_appr.c268
-rw-r--r--gmp/mpn/generic/hgcd_jacobi.c244
-rw-r--r--gmp/mpn/generic/hgcd_matrix.c266
-rw-r--r--gmp/mpn/generic/hgcd_reduce.c247
-rw-r--r--gmp/mpn/generic/hgcd_step.c128
-rw-r--r--gmp/mpn/generic/invert.c113
-rw-r--r--gmp/mpn/generic/invertappr.c314
-rw-r--r--gmp/mpn/generic/jacbase.c117
-rw-r--r--gmp/mpn/generic/jacobi.c295
-rw-r--r--gmp/mpn/generic/jacobi_2.c352
-rw-r--r--gmp/mpn/generic/logops_n.c78
-rw-r--r--gmp/mpn/generic/lshift.c28
-rw-r--r--gmp/mpn/generic/lshiftc.c74
-rw-r--r--gmp/mpn/generic/matrix22_mul.c258
-rw-r--r--gmp/mpn/generic/matrix22_mul1_inverse_vector.c65
-rw-r--r--gmp/mpn/generic/mod_1.c136
-rw-r--r--gmp/mpn/generic/mod_1_1.c310
-rw-r--r--gmp/mpn/generic/mod_1_2.c96
-rw-r--r--gmp/mpn/generic/mod_1_3.c123
-rw-r--r--gmp/mpn/generic/mod_1_4.c145
-rw-r--r--gmp/mpn/generic/mod_34lsub1.c27
-rw-r--r--gmp/mpn/generic/mode1o.c29
-rw-r--r--gmp/mpn/generic/mu_bdiv_q.c234
-rw-r--r--gmp/mpn/generic/mu_bdiv_qr.c290
-rw-r--r--gmp/mpn/generic/mu_div_q.c192
-rw-r--r--gmp/mpn/generic/mu_div_qr.c405
-rw-r--r--gmp/mpn/generic/mu_divappr_q.c329
-rw-r--r--gmp/mpn/generic/mul.c410
-rw-r--r--gmp/mpn/generic/mul_1.c28
-rw-r--r--gmp/mpn/generic/mul_basecase.c31
-rw-r--r--gmp/mpn/generic/mul_fft.c484
-rw-r--r--gmp/mpn/generic/mul_n.c803
-rw-r--r--gmp/mpn/generic/mullo_basecase.c52
-rw-r--r--gmp/mpn/generic/mullo_n.c256
-rw-r--r--gmp/mpn/generic/mullow_basecase.c41
-rw-r--r--gmp/mpn/generic/mullow_n.c111
-rw-r--r--gmp/mpn/generic/mulmid.c256
-rw-r--r--gmp/mpn/generic/mulmid_basecase.c83
-rw-r--r--gmp/mpn/generic/mulmid_n.c62
-rw-r--r--gmp/mpn/generic/mulmod_bnm1.c355
-rw-r--r--gmp/mpn/generic/neg.c34
-rw-r--r--gmp/mpn/generic/neg_n.c23
-rw-r--r--gmp/mpn/generic/nussbaumer_mul.c71
-rw-r--r--gmp/mpn/generic/perfpow.c417
-rw-r--r--gmp/mpn/generic/perfsqr.c61
-rw-r--r--gmp/mpn/generic/popham.c34
-rw-r--r--gmp/mpn/generic/pow_1.c48
-rw-r--r--gmp/mpn/generic/powlo.c72
-rw-r--r--gmp/mpn/generic/powm.c546
-rw-r--r--gmp/mpn/generic/powm_sec.c272
-rw-r--r--gmp/mpn/generic/pre_divrem_1.c35
-rw-r--r--gmp/mpn/generic/pre_mod_1.c34
-rw-r--r--gmp/mpn/generic/random.c25
-rw-r--r--gmp/mpn/generic/random2.c36
-rw-r--r--gmp/mpn/generic/redc_1.c40
-rw-r--r--gmp/mpn/generic/redc_2.c41
-rw-r--r--gmp/mpn/generic/redc_n.c81
-rw-r--r--gmp/mpn/generic/remove.c172
-rw-r--r--gmp/mpn/generic/rootrem.c152
-rw-r--r--gmp/mpn/generic/rshift.c28
-rw-r--r--gmp/mpn/generic/sb_bdiv_q.c91
-rw-r--r--gmp/mpn/generic/sb_bdiv_qr.c (renamed from gmp/mpn/generic/sbpi1_bdiv_qr.c)51
-rw-r--r--gmp/mpn/generic/sb_div_q.c240
-rw-r--r--gmp/mpn/generic/sb_div_qr.c91
-rw-r--r--gmp/mpn/generic/sb_divappr_q.c136
-rw-r--r--gmp/mpn/generic/sb_divrem_mn.c205
-rw-r--r--gmp/mpn/generic/sbpi1_bdiv_q.c100
-rw-r--r--gmp/mpn/generic/sbpi1_div_q.c303
-rw-r--r--gmp/mpn/generic/sbpi1_div_qr.c110
-rw-r--r--gmp/mpn/generic/sbpi1_divappr_q.c199
-rw-r--r--gmp/mpn/generic/scan0.c30
-rw-r--r--gmp/mpn/generic/scan1.c30
-rw-r--r--gmp/mpn/generic/sec_aors_1.c60
-rw-r--r--gmp/mpn/generic/sec_div.c133
-rw-r--r--gmp/mpn/generic/sec_invert.c195
-rw-r--r--gmp/mpn/generic/sec_mul.c49
-rw-r--r--gmp/mpn/generic/sec_pi1_div.c173
-rw-r--r--gmp/mpn/generic/sec_powm.c438
-rw-r--r--gmp/mpn/generic/sec_sqr.c48
-rw-r--r--gmp/mpn/generic/sec_tabselect.c55
-rw-r--r--gmp/mpn/generic/set_str.c54
-rw-r--r--gmp/mpn/generic/sizeinbase.c54
-rw-r--r--gmp/mpn/generic/sqr.c99
-rw-r--r--gmp/mpn/generic/sqr_basecase.c95
-rw-r--r--gmp/mpn/generic/sqrmod_bnm1.c313
-rw-r--r--gmp/mpn/generic/sqrtrem.c141
-rw-r--r--gmp/mpn/generic/sub.c25
-rw-r--r--gmp/mpn/generic/sub_1.c25
-rw-r--r--gmp/mpn/generic/sub_err1_n.c101
-rw-r--r--gmp/mpn/generic/sub_err2_n.c117
-rw-r--r--gmp/mpn/generic/sub_err3_n.c132
-rw-r--r--gmp/mpn/generic/sub_n.c35
-rw-r--r--gmp/mpn/generic/subcnd_n.c85
-rw-r--r--gmp/mpn/generic/submul_1.c28
-rw-r--r--gmp/mpn/generic/tdiv_qr.c144
-rw-r--r--gmp/mpn/generic/toom22_mul.c53
-rw-r--r--gmp/mpn/generic/toom2_sqr.c49
-rw-r--r--gmp/mpn/generic/toom32_mul.c316
-rw-r--r--gmp/mpn/generic/toom33_mul.c143
-rw-r--r--gmp/mpn/generic/toom3_sqr.c127
-rw-r--r--gmp/mpn/generic/toom42_mul.c97
-rw-r--r--gmp/mpn/generic/toom42_mulmid.c238
-rw-r--r--gmp/mpn/generic/toom43_mul.c234
-rw-r--r--gmp/mpn/generic/toom44_mul.c348
-rw-r--r--gmp/mpn/generic/toom4_sqr.c213
-rw-r--r--gmp/mpn/generic/toom52_mul.c257
-rw-r--r--gmp/mpn/generic/toom53_mul.c286
-rw-r--r--gmp/mpn/generic/toom54_mul.c143
-rw-r--r--gmp/mpn/generic/toom62_mul.c291
-rw-r--r--gmp/mpn/generic/toom63_mul.c232
-rw-r--r--gmp/mpn/generic/toom6_sqr.c182
-rw-r--r--gmp/mpn/generic/toom6h_mul.c263
-rw-r--r--gmp/mpn/generic/toom8_sqr.c226
-rw-r--r--gmp/mpn/generic/toom8h_mul.c306
-rw-r--r--gmp/mpn/generic/toom_couple_handling.c81
-rw-r--r--gmp/mpn/generic/toom_eval_dgr3_pm1.c73
-rw-r--r--gmp/mpn/generic/toom_eval_dgr3_pm2.c98
-rw-r--r--gmp/mpn/generic/toom_eval_pm1.c90
-rw-r--r--gmp/mpn/generic/toom_eval_pm2.c131
-rw-r--r--gmp/mpn/generic/toom_eval_pm2exp.c128
-rw-r--r--gmp/mpn/generic/toom_eval_pm2rexp.c102
-rw-r--r--gmp/mpn/generic/toom_interpolate_12pts.c361
-rw-r--r--gmp/mpn/generic/toom_interpolate_16pts.c527
-rw-r--r--gmp/mpn/generic/toom_interpolate_5pts.c149
-rw-r--r--gmp/mpn/generic/toom_interpolate_6pts.c240
-rw-r--r--gmp/mpn/generic/toom_interpolate_7pts.c239
-rw-r--r--gmp/mpn/generic/toom_interpolate_8pts.c212
-rw-r--r--gmp/mpn/generic/trialdiv.c132
-rw-r--r--gmp/mpn/generic/udiv_w_sdiv.c42
-rw-r--r--gmp/mpn/generic/zero.c42
-rw-r--r--gmp/mpn/i960/README9
-rw-r--r--gmp/mpn/i960/add_n.s41
-rw-r--r--gmp/mpn/i960/addmul_1.s46
-rw-r--r--gmp/mpn/i960/mul_1.s43
-rw-r--r--gmp/mpn/i960/sub_n.s41
-rw-r--r--gmp/mpn/ia64/README38
-rw-r--r--gmp/mpn/ia64/add_n_sub_n.asm309
-rw-r--r--gmp/mpn/ia64/addmul_1.asm36
-rw-r--r--gmp/mpn/ia64/addmul_2.asm1107
-rw-r--r--gmp/mpn/ia64/aors_n.asm1343
-rw-r--r--gmp/mpn/ia64/aorslsh1_n.asm323
-rw-r--r--gmp/mpn/ia64/aorsorrlsh1_n.asm48
-rw-r--r--gmp/mpn/ia64/aorsorrlsh2_n.asm48
-rw-r--r--gmp/mpn/ia64/aorsorrlshC_n.asm397
-rw-r--r--gmp/mpn/ia64/bdiv_dbm1c.asm33
-rw-r--r--gmp/mpn/ia64/cnd_aors_n.asm259
-rw-r--r--gmp/mpn/ia64/copyd.asm33
-rw-r--r--gmp/mpn/ia64/copyi.asm33
-rw-r--r--gmp/mpn/ia64/dive_1.asm47
-rw-r--r--gmp/mpn/ia64/divrem_1.asm33
-rw-r--r--gmp/mpn/ia64/divrem_2.asm466
-rw-r--r--gmp/mpn/ia64/gcd_1.asm169
-rw-r--r--gmp/mpn/ia64/gmp-mparam.h258
-rw-r--r--gmp/mpn/ia64/hamdist.asm38
-rw-r--r--gmp/mpn/ia64/ia64-defs.m449
-rw-r--r--gmp/mpn/ia64/invert_limb.asm35
-rw-r--r--gmp/mpn/ia64/logops_n.asm33
-rw-r--r--gmp/mpn/ia64/lorrshift.asm452
-rw-r--r--gmp/mpn/ia64/lshiftc.asm463
-rw-r--r--gmp/mpn/ia64/mod_34lsub1.asm236
-rw-r--r--gmp/mpn/ia64/mode1o.asm37
-rw-r--r--gmp/mpn/ia64/mul_1.asm37
-rw-r--r--gmp/mpn/ia64/mul_2.asm1021
-rw-r--r--gmp/mpn/ia64/popcount.asm37
-rw-r--r--gmp/mpn/ia64/rsh1aors_n.asm37
-rw-r--r--gmp/mpn/ia64/sec_tabselect.asm150
-rw-r--r--gmp/mpn/ia64/sqr_diag_addlsh1.asm144
-rw-r--r--gmp/mpn/ia64/sqr_diagonal.asm79
-rw-r--r--gmp/mpn/ia64/submul_1.asm35
-rw-r--r--gmp/mpn/lisp/gmpasm-mode.el43
-rwxr-xr-xgmp/mpn/m4-ccas33
-rw-r--r--gmp/mpn/m68k/README25
-rw-r--r--gmp/mpn/m68k/aors_n.asm36
-rw-r--r--gmp/mpn/m68k/gmp-mparam.h37
-rw-r--r--gmp/mpn/m68k/lshift.asm42
-rw-r--r--gmp/mpn/m68k/m68k-defs.m435
-rw-r--r--gmp/mpn/m68k/mc68020/aorsmul_1.asm37
-rw-r--r--gmp/mpn/m68k/mc68020/mul_1.asm36
-rw-r--r--gmp/mpn/m68k/mc68020/udiv.asm35
-rw-r--r--gmp/mpn/m68k/mc68020/umul.asm35
-rw-r--r--gmp/mpn/m68k/rshift.asm52
-rw-r--r--gmp/mpn/m68k/t-m68k-defs.pl33
-rw-r--r--gmp/mpn/m88k/README25
-rw-r--r--gmp/mpn/m88k/add_n.s39
-rw-r--r--gmp/mpn/m88k/mc88110/add_n.S39
-rw-r--r--gmp/mpn/m88k/mc88110/addmul_1.s39
-rw-r--r--gmp/mpn/m88k/mc88110/mul_1.s39
-rw-r--r--gmp/mpn/m88k/mc88110/sub_n.S39
-rw-r--r--gmp/mpn/m88k/mul_1.s39
-rw-r--r--gmp/mpn/m88k/sub_n.s39
-rw-r--r--gmp/mpn/minithres/gmp-mparam.h135
-rw-r--r--gmp/mpn/mips32/add_n.asm31
-rw-r--r--gmp/mpn/mips32/addmul_1.asm31
-rw-r--r--gmp/mpn/mips32/gmp-mparam.h40
-rw-r--r--gmp/mpn/mips32/lshift.asm31
-rw-r--r--gmp/mpn/mips32/mips-defs.m427
-rw-r--r--gmp/mpn/mips32/mips.m427
-rw-r--r--gmp/mpn/mips32/mul_1.asm31
-rw-r--r--gmp/mpn/mips32/rshift.asm31
-rw-r--r--gmp/mpn/mips32/sub_n.asm31
-rw-r--r--gmp/mpn/mips32/submul_1.asm31
-rw-r--r--gmp/mpn/mips32/umul.asm31
-rw-r--r--gmp/mpn/mips64/README27
-rw-r--r--gmp/mpn/mips64/add_n.asm46
-rw-r--r--gmp/mpn/mips64/addmul_1.asm34
-rw-r--r--gmp/mpn/mips64/gmp-mparam.h40
-rw-r--r--gmp/mpn/mips64/lshift.asm33
-rw-r--r--gmp/mpn/mips64/mul_1.asm34
-rw-r--r--gmp/mpn/mips64/rshift.asm33
-rw-r--r--gmp/mpn/mips64/sqr_diagonal.asm31
-rw-r--r--gmp/mpn/mips64/sub_n.asm46
-rw-r--r--gmp/mpn/mips64/submul_1.asm34
-rw-r--r--gmp/mpn/mips64/umul.asm31
-rw-r--r--gmp/mpn/ns32k/add_n.s44
-rw-r--r--gmp/mpn/ns32k/addmul_1.s46
-rw-r--r--gmp/mpn/ns32k/mul_1.s45
-rw-r--r--gmp/mpn/ns32k/sub_n.s44
-rw-r--r--gmp/mpn/ns32k/submul_1.s46
-rw-r--r--gmp/mpn/pa32/README25
-rw-r--r--gmp/mpn/pa32/add_n.asm33
-rw-r--r--gmp/mpn/pa32/gmp-mparam.h50
-rw-r--r--gmp/mpn/pa32/hppa1_1/addmul_1.asm34
-rw-r--r--gmp/mpn/pa32/hppa1_1/gmp-mparam.h38
-rw-r--r--gmp/mpn/pa32/hppa1_1/mul_1.asm34
-rw-r--r--gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm34
-rw-r--r--gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm33
-rw-r--r--gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm34
-rw-r--r--gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm34
-rw-r--r--gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm34
-rw-r--r--gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm33
-rw-r--r--gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm31
-rw-r--r--gmp/mpn/pa32/hppa1_1/submul_1.asm34
-rw-r--r--gmp/mpn/pa32/hppa1_1/udiv.asm31
-rw-r--r--gmp/mpn/pa32/hppa1_1/umul.asm31
-rw-r--r--gmp/mpn/pa32/hppa2_0/add_n.asm33
-rw-r--r--gmp/mpn/pa32/hppa2_0/gmp-mparam.h222
-rw-r--r--gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm31
-rw-r--r--gmp/mpn/pa32/hppa2_0/sub_n.asm33
-rw-r--r--gmp/mpn/pa32/lshift.asm33
-rw-r--r--gmp/mpn/pa32/pa-defs.m435
-rw-r--r--gmp/mpn/pa32/rshift.asm33
-rw-r--r--gmp/mpn/pa32/sub_n.asm33
-rw-r--r--gmp/mpn/pa32/udiv.asm33
-rw-r--r--gmp/mpn/pa64/README25
-rw-r--r--gmp/mpn/pa64/add_n.asm93
-rw-r--r--gmp/mpn/pa64/addmul_1.asm33
-rw-r--r--gmp/mpn/pa64/aors_n.asm130
-rw-r--r--gmp/mpn/pa64/aorslsh1_n.asm31
-rw-r--r--gmp/mpn/pa64/gmp-mparam.h293
-rw-r--r--gmp/mpn/pa64/lshift.asm31
-rw-r--r--gmp/mpn/pa64/mul_1.asm33
-rw-r--r--gmp/mpn/pa64/rshift.asm31
-rw-r--r--gmp/mpn/pa64/sqr_diagonal.asm33
-rw-r--r--gmp/mpn/pa64/sub_n.asm93
-rw-r--r--gmp/mpn/pa64/submul_1.asm33
-rw-r--r--gmp/mpn/pa64/udiv.asm33
-rw-r--r--gmp/mpn/pa64/umul.asm34
-rw-r--r--gmp/mpn/power/add_n.asm34
-rw-r--r--gmp/mpn/power/addmul_1.asm33
-rw-r--r--gmp/mpn/power/gmp-mparam.h33
-rw-r--r--gmp/mpn/power/lshift.asm33
-rw-r--r--gmp/mpn/power/mul_1.asm33
-rw-r--r--gmp/mpn/power/rshift.asm33
-rw-r--r--gmp/mpn/power/sdiv.asm31
-rw-r--r--gmp/mpn/power/sub_n.asm34
-rw-r--r--gmp/mpn/power/submul_1.asm33
-rw-r--r--gmp/mpn/power/umul.asm31
-rw-r--r--gmp/mpn/powerpc32/750/com.asm79
-rw-r--r--gmp/mpn/powerpc32/750/com_n.asm68
-rw-r--r--gmp/mpn/powerpc32/750/gmp-mparam.h230
-rw-r--r--gmp/mpn/powerpc32/750/lshift.asm31
-rw-r--r--gmp/mpn/powerpc32/750/rshift.asm31
-rw-r--r--gmp/mpn/powerpc32/README25
-rw-r--r--gmp/mpn/powerpc32/addlsh1_n.asm31
-rw-r--r--gmp/mpn/powerpc32/addmul_1.asm34
-rw-r--r--gmp/mpn/powerpc32/aix.m435
-rw-r--r--gmp/mpn/powerpc32/aors_n.asm50
-rw-r--r--gmp/mpn/powerpc32/bdiv_dbm1c.asm31
-rw-r--r--gmp/mpn/powerpc32/darwin.m441
-rw-r--r--gmp/mpn/powerpc32/diveby3.asm33
-rw-r--r--gmp/mpn/powerpc32/divrem_2.asm41
-rw-r--r--gmp/mpn/powerpc32/eabi.m433
-rw-r--r--gmp/mpn/powerpc32/elf.m440
-rw-r--r--gmp/mpn/powerpc32/gmp-mparam.h256
-rw-r--r--gmp/mpn/powerpc32/invert_limb.asm142
-rw-r--r--gmp/mpn/powerpc32/lshift.asm38
-rw-r--r--gmp/mpn/powerpc32/lshiftc.asm168
-rw-r--r--gmp/mpn/powerpc32/mod_34lsub1.asm31
-rw-r--r--gmp/mpn/powerpc32/mode1o.asm33
-rw-r--r--gmp/mpn/powerpc32/mul_1.asm31
-rw-r--r--gmp/mpn/powerpc32/p3-p7/aors_n.asm186
-rw-r--r--gmp/mpn/powerpc32/p3/gmp-mparam.h155
-rw-r--r--gmp/mpn/powerpc32/p4/gmp-mparam.h204
-rw-r--r--gmp/mpn/powerpc32/p5/gmp-mparam.h156
-rw-r--r--gmp/mpn/powerpc32/p6/gmp-mparam.h165
-rw-r--r--gmp/mpn/powerpc32/p7/gmp-mparam.h159
-rw-r--r--gmp/mpn/powerpc32/powerpc-defs.m433
-rw-r--r--gmp/mpn/powerpc32/rshift.asm38
-rw-r--r--gmp/mpn/powerpc32/sec_tabselect.asm141
-rw-r--r--gmp/mpn/powerpc32/sqr_diag_addlsh1.asm80
-rw-r--r--gmp/mpn/powerpc32/sqr_diagonal.asm103
-rw-r--r--gmp/mpn/powerpc32/sublsh1_n.asm31
-rw-r--r--gmp/mpn/powerpc32/submul_1.asm31
-rw-r--r--gmp/mpn/powerpc32/umul.asm37
-rw-r--r--gmp/mpn/powerpc32/vmx/copyd.asm33
-rw-r--r--gmp/mpn/powerpc32/vmx/copyi.asm33
-rw-r--r--gmp/mpn/powerpc32/vmx/logops_n.asm31
-rw-r--r--gmp/mpn/powerpc32/vmx/mod_34lsub1.asm47
-rw-r--r--gmp/mpn/powerpc32/vmx/popcount.asm36
-rw-r--r--gmp/mpn/powerpc64/README31
-rw-r--r--gmp/mpn/powerpc64/aix.m449
-rw-r--r--gmp/mpn/powerpc64/com.asm136
-rw-r--r--gmp/mpn/powerpc64/com_n.asm74
-rw-r--r--gmp/mpn/powerpc64/copyd.asm40
-rw-r--r--gmp/mpn/powerpc64/copyi.asm40
-rw-r--r--gmp/mpn/powerpc64/darwin.m445
-rw-r--r--gmp/mpn/powerpc64/elf.m472
-rw-r--r--gmp/mpn/powerpc64/gmp-mparam.h63
-rw-r--r--gmp/mpn/powerpc64/logops_n.asm42
-rw-r--r--gmp/mpn/powerpc64/lshift.asm283
-rw-r--r--gmp/mpn/powerpc64/lshiftc.asm210
-rw-r--r--gmp/mpn/powerpc64/mode32/add_n.asm33
-rw-r--r--gmp/mpn/powerpc64/mode32/addmul_1.asm33
-rw-r--r--gmp/mpn/powerpc64/mode32/mul_1.asm33
-rw-r--r--gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h173
-rw-r--r--gmp/mpn/powerpc64/mode32/sqr_diagonal.asm117
-rw-r--r--gmp/mpn/powerpc64/mode32/sub_n.asm33
-rw-r--r--gmp/mpn/powerpc64/mode32/submul_1.asm33
-rw-r--r--gmp/mpn/powerpc64/mode64/addlsh1_n.asm82
-rw-r--r--gmp/mpn/powerpc64/mode64/addmul_1.asm185
-rw-r--r--gmp/mpn/powerpc64/mode64/aors_n.asm76
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsmul_1.asm225
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm187
-rw-r--r--gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm70
-rw-r--r--gmp/mpn/powerpc64/mode64/cnd_aors_n.asm196
-rw-r--r--gmp/mpn/powerpc64/mode64/dive_1.asm64
-rw-r--r--gmp/mpn/powerpc64/mode64/diveby3.asm83
-rw-r--r--gmp/mpn/powerpc64/mode64/divrem_1.asm136
-rw-r--r--gmp/mpn/powerpc64/mode64/divrem_2.asm51
-rw-r--r--gmp/mpn/powerpc64/mode64/gcd_1.asm122
-rw-r--r--gmp/mpn/powerpc64/mode64/gmp-mparam.h83
-rw-r--r--gmp/mpn/powerpc64/mode64/invert_limb.asm159
-rw-r--r--gmp/mpn/powerpc64/mode64/mod_1_1.asm164
-rw-r--r--gmp/mpn/powerpc64/mode64/mod_1_4.asm270
-rw-r--r--gmp/mpn/powerpc64/mode64/mod_34lsub1.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/mode1o.asm47
-rw-r--r--gmp/mpn/powerpc64/mode64/mul_1.asm59
-rw-r--r--gmp/mpn/powerpc64/mode64/mul_basecase.asm46
-rw-r--r--gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h179
-rw-r--r--gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h208
-rw-r--r--gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h219
-rw-r--r--gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm183
-rw-r--r--gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h160
-rw-r--r--gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm589
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aormul_2.asm135
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aors_n.asm128
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm43
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm129
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/gcd_1.asm110
-rw-r--r--gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h243
-rw-r--r--gmp/mpn/powerpc64/mode64/rsh1add_n.asm104
-rw-r--r--gmp/mpn/powerpc64/mode64/rsh1aors_n.asm172
-rw-r--r--gmp/mpn/powerpc64/mode64/rsh1sub_n.asm102
-rw-r--r--gmp/mpn/powerpc64/mode64/sqr_basecase.asm863
-rw-r--r--gmp/mpn/powerpc64/mode64/sublsh1_n.asm83
-rw-r--r--gmp/mpn/powerpc64/mode64/submul_1.asm62
-rw-r--r--gmp/mpn/powerpc64/p6/lshift.asm132
-rw-r--r--gmp/mpn/powerpc64/p6/lshiftc.asm136
-rw-r--r--gmp/mpn/powerpc64/p6/rshift.asm131
-rw-r--r--gmp/mpn/powerpc64/p7/copyd.asm128
-rw-r--r--gmp/mpn/powerpc64/p7/copyi.asm129
-rw-r--r--gmp/mpn/powerpc64/p7/hamdist.asm110
-rw-r--r--gmp/mpn/powerpc64/p7/popcount.asm90
-rw-r--r--gmp/mpn/powerpc64/rshift.asm264
-rw-r--r--gmp/mpn/powerpc64/sec_tabselect.asm147
-rw-r--r--gmp/mpn/powerpc64/sqr_diagonal.asm55
-rw-r--r--gmp/mpn/powerpc64/umul.asm37
-rw-r--r--gmp/mpn/powerpc64/vmx/popcount.asm192
-rw-r--r--gmp/mpn/pyr/add_n.s74
-rw-r--r--gmp/mpn/pyr/addmul_1.s43
-rw-r--r--gmp/mpn/pyr/mul_1.s40
-rw-r--r--gmp/mpn/pyr/sub_n.s74
-rw-r--r--gmp/mpn/s390/README (renamed from gmp/mpn/s390_32/README)0
-rw-r--r--gmp/mpn/s390/addmul_1.asm (renamed from gmp/mpn/s390_32/addmul_1.asm)31
-rw-r--r--gmp/mpn/s390/gmp-mparam.h54
-rw-r--r--gmp/mpn/s390/mul_1.asm (renamed from gmp/mpn/s390_32/mul_1.asm)31
-rw-r--r--gmp/mpn/s390/submul_1.asm (renamed from gmp/mpn/s390_32/submul_1.asm)31
-rw-r--r--gmp/mpn/s390_32/copyd.asm145
-rw-r--r--gmp/mpn/s390_32/copyi.asm69
-rw-r--r--gmp/mpn/s390_32/esame/addmul_1.asm72
-rw-r--r--gmp/mpn/s390_32/esame/aors_n.asm137
-rw-r--r--gmp/mpn/s390_32/esame/aorslsh1_n.asm173
-rw-r--r--gmp/mpn/s390_32/esame/bdiv_dbm1c.asm65
-rw-r--r--gmp/mpn/s390_32/esame/gmp-mparam.h207
-rw-r--r--gmp/mpn/s390_32/esame/mul_1.asm66
-rw-r--r--gmp/mpn/s390_32/esame/mul_basecase.asm130
-rw-r--r--gmp/mpn/s390_32/esame/sqr_basecase.asm203
-rw-r--r--gmp/mpn/s390_32/esame/submul_1.asm70
-rw-r--r--gmp/mpn/s390_32/gmp-mparam.h138
-rw-r--r--gmp/mpn/s390_32/logops_n.asm295
-rw-r--r--gmp/mpn/s390_32/lshift.asm144
-rw-r--r--gmp/mpn/s390_32/lshiftc.asm156
-rw-r--r--gmp/mpn/s390_32/rshift.asm138
-rw-r--r--gmp/mpn/s390_64/README88
-rw-r--r--gmp/mpn/s390_64/addmul_1.asm72
-rw-r--r--gmp/mpn/s390_64/aorrlsh1_n.asm168
-rw-r--r--gmp/mpn/s390_64/aors_n.asm136
-rw-r--r--gmp/mpn/s390_64/bdiv_dbm1c.asm65
-rw-r--r--gmp/mpn/s390_64/copyd.asm144
-rw-r--r--gmp/mpn/s390_64/copyi.asm68
-rw-r--r--gmp/mpn/s390_64/gmp-mparam.h175
-rw-r--r--gmp/mpn/s390_64/invert_limb.asm94
-rw-r--r--gmp/mpn/s390_64/logops_n.asm291
-rw-r--r--gmp/mpn/s390_64/lshift.asm196
-rw-r--r--gmp/mpn/s390_64/lshiftc.asm207
-rw-r--r--gmp/mpn/s390_64/mod_34lsub1.asm109
-rw-r--r--gmp/mpn/s390_64/mul_1.asm66
-rw-r--r--gmp/mpn/s390_64/mul_basecase.asm130
-rw-r--r--gmp/mpn/s390_64/rshift.asm195
-rw-r--r--gmp/mpn/s390_64/sqr_basecase.asm203
-rw-r--r--gmp/mpn/s390_64/sublsh1_n.asm169
-rw-r--r--gmp/mpn/s390_64/submul_1.asm70
-rw-r--r--gmp/mpn/s390_64/z10/gmp-mparam.h231
-rw-r--r--gmp/mpn/sh/add_n.asm59
-rw-r--r--gmp/mpn/sh/add_n.s45
-rw-r--r--gmp/mpn/sh/sh2/addmul_1.asm65
-rw-r--r--gmp/mpn/sh/sh2/addmul_1.s51
-rw-r--r--gmp/mpn/sh/sh2/mul_1.asm62
-rw-r--r--gmp/mpn/sh/sh2/mul_1.s48
-rw-r--r--gmp/mpn/sh/sh2/submul_1.asm65
-rw-r--r--gmp/mpn/sh/sh2/submul_1.s51
-rw-r--r--gmp/mpn/sh/sub_n.asm59
-rw-r--r--gmp/mpn/sh/sub_n.s45
-rw-r--r--gmp/mpn/sparc32/README25
-rw-r--r--gmp/mpn/sparc32/add_n.asm31
-rw-r--r--gmp/mpn/sparc32/addmul_1.asm33
-rw-r--r--gmp/mpn/sparc32/gmp-mparam.h34
-rw-r--r--gmp/mpn/sparc32/lshift.asm31
-rw-r--r--gmp/mpn/sparc32/mul_1.asm33
-rw-r--r--gmp/mpn/sparc32/rshift.asm31
-rw-r--r--gmp/mpn/sparc32/sparc-defs.m466
-rw-r--r--gmp/mpn/sparc32/sub_n.asm31
-rw-r--r--gmp/mpn/sparc32/submul_1.asm33
-rw-r--r--gmp/mpn/sparc32/udiv.asm31
-rw-r--r--gmp/mpn/sparc32/udiv_nfp.asm31
-rw-r--r--gmp/mpn/sparc32/ultrasparct1/add_n.asm70
-rw-r--r--gmp/mpn/sparc32/ultrasparct1/addmul_1.asm90
-rw-r--r--gmp/mpn/sparc32/ultrasparct1/gmp-mparam.h153
-rw-r--r--gmp/mpn/sparc32/ultrasparct1/mul_1.asm83
-rw-r--r--gmp/mpn/sparc32/ultrasparct1/sqr_diagonal.asm55
-rw-r--r--gmp/mpn/sparc32/ultrasparct1/sub_n.asm70
-rw-r--r--gmp/mpn/sparc32/ultrasparct1/submul_1.asm91
-rw-r--r--gmp/mpn/sparc32/umul.asm31
-rw-r--r--gmp/mpn/sparc32/v8/addmul_1.asm33
-rw-r--r--gmp/mpn/sparc32/v8/gmp-mparam.h38
-rw-r--r--gmp/mpn/sparc32/v8/mul_1.asm31
-rw-r--r--gmp/mpn/sparc32/v8/submul_1.asm33
-rw-r--r--gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h38
-rw-r--r--gmp/mpn/sparc32/v8/supersparc/udiv.asm31
-rw-r--r--gmp/mpn/sparc32/v8/udiv.asm31
-rw-r--r--gmp/mpn/sparc32/v8/umul.asm31
-rw-r--r--gmp/mpn/sparc32/v9/add_n.asm31
-rw-r--r--gmp/mpn/sparc32/v9/addmul_1.asm31
-rw-r--r--gmp/mpn/sparc32/v9/gmp-mparam.h257
-rw-r--r--gmp/mpn/sparc32/v9/mul_1.asm31
-rw-r--r--gmp/mpn/sparc32/v9/sqr_diagonal.asm31
-rw-r--r--gmp/mpn/sparc32/v9/sub_n.asm31
-rw-r--r--gmp/mpn/sparc32/v9/submul_1.asm31
-rw-r--r--gmp/mpn/sparc32/v9/udiv.asm31
-rw-r--r--gmp/mpn/sparc64/README29
-rw-r--r--gmp/mpn/sparc64/add_n.asm (renamed from gmp/mpn/sparc64/ultrasparc1234/add_n.asm)137
-rw-r--r--gmp/mpn/sparc64/addmul_1.asm (renamed from gmp/mpn/sparc64/ultrasparc1234/addmul_1.asm)44
-rw-r--r--gmp/mpn/sparc64/addmul_2.asm (renamed from gmp/mpn/sparc64/ultrasparc1234/addmul_2.asm)31
-rw-r--r--gmp/mpn/sparc64/copyd.asm48
-rw-r--r--gmp/mpn/sparc64/copyi.asm48
-rw-r--r--gmp/mpn/sparc64/dive_1.c25
-rw-r--r--gmp/mpn/sparc64/divrem_1.c29
-rw-r--r--gmp/mpn/sparc64/gcd_1.asm135
-rw-r--r--gmp/mpn/sparc64/gmp-mparam.h201
-rw-r--r--gmp/mpn/sparc64/lshift.asm256
-rw-r--r--gmp/mpn/sparc64/lshiftc.asm147
-rw-r--r--gmp/mpn/sparc64/mod_1.c82
-rw-r--r--gmp/mpn/sparc64/mod_1_4.c236
-rw-r--r--gmp/mpn/sparc64/mode1o.c27
-rw-r--r--gmp/mpn/sparc64/mul_1.asm (renamed from gmp/mpn/sparc64/ultrasparc1234/mul_1.asm)43
-rw-r--r--gmp/mpn/sparc64/rshift.asm255
-rw-r--r--gmp/mpn/sparc64/sec_tabselect.asm162
-rw-r--r--gmp/mpn/sparc64/sparc64.h43
-rw-r--r--gmp/mpn/sparc64/sqr_diagonal.asm (renamed from gmp/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm)31
-rw-r--r--gmp/mpn/sparc64/sub_n.asm (renamed from gmp/mpn/sparc64/ultrasparc1234/sub_n.asm)109
-rw-r--r--gmp/mpn/sparc64/submul_1.asm (renamed from gmp/mpn/sparc64/ultrasparc1234/submul_1.asm)33
-rw-r--r--gmp/mpn/sparc64/ultrasparc1234/lshiftc.asm165
-rw-r--r--gmp/mpn/sparc64/ultrasparc34/gmp-mparam.h219
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/add_n.asm68
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/addlsh1_n.asm41
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/addlsh2_n.asm41
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/addlshC_n.asm69
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/addmul_1.asm86
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/gmp-mparam.h154
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/mul_1.asm82
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/rsblsh1_n.asm41
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/rsblsh2_n.asm41
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/rsblshC_n.asm69
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/sub_n.asm68
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/sublsh1_n.asm41
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/sublsh2_n.asm41
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/sublshC_n.asm69
-rw-r--r--gmp/mpn/sparc64/ultrasparct1/submul_1.asm86
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/add_n.asm126
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/addmul_1.asm182
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/aormul_2.asm228
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/aormul_4.asm219
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/aorslsh_n.asm147
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm147
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/cnd_aors_n.asm143
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/dive_1.asm129
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/hamdist.asm78
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/invert_limb.asm92
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/missing.asm77
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/missing.m488
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/mod_1_4.asm233
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/mod_34lsub1.asm117
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/mode1o.asm82
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/mul_1.asm174
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/popcount.asm70
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm93
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/sub_n.asm144
-rw-r--r--gmp/mpn/sparc64/ultrasparct3/submul_1.asm170
-rw-r--r--gmp/mpn/thumb/add_n.asm63
-rw-r--r--gmp/mpn/thumb/add_n.s48
-rw-r--r--gmp/mpn/thumb/sub_n.asm63
-rw-r--r--gmp/mpn/thumb/sub_n.s48
-rw-r--r--gmp/mpn/vax/add_n.asm64
-rw-r--r--gmp/mpn/vax/add_n.s59
-rw-r--r--gmp/mpn/vax/addmul_1.asm124
-rw-r--r--gmp/mpn/vax/addmul_1.s124
-rw-r--r--gmp/mpn/vax/elf.m454
-rw-r--r--gmp/mpn/vax/gmp-mparam.h35
-rw-r--r--gmp/mpn/vax/lshift.asm59
-rw-r--r--gmp/mpn/vax/lshift.s56
-rw-r--r--gmp/mpn/vax/mul_1.asm118
-rw-r--r--gmp/mpn/vax/mul_1.s121
-rw-r--r--gmp/mpn/vax/rshift.asm57
-rw-r--r--gmp/mpn/vax/rshift.s54
-rw-r--r--gmp/mpn/vax/sub_n.asm64
-rw-r--r--gmp/mpn/vax/sub_n.s59
-rw-r--r--gmp/mpn/vax/submul_1.asm124
-rw-r--r--gmp/mpn/vax/submul_1.s124
-rw-r--r--gmp/mpn/x86/README27
-rw-r--r--gmp/mpn/x86/aors_n.asm50
-rw-r--r--gmp/mpn/x86/aorsmul_1.asm61
-rw-r--r--gmp/mpn/x86/atom/aorrlsh1_n.asm53
-rw-r--r--gmp/mpn/x86/atom/aorrlsh2_n.asm53
-rw-r--r--gmp/mpn/x86/atom/aorrlshC_n.asm156
-rw-r--r--gmp/mpn/x86/atom/aors_n.asm159
-rw-r--r--gmp/mpn/x86/atom/aorslshC_n.asm247
-rw-r--r--gmp/mpn/x86/atom/bdiv_q_1.asm35
-rw-r--r--gmp/mpn/x86/atom/cnd_add_n.asm113
-rw-r--r--gmp/mpn/x86/atom/cnd_sub_n.asm124
-rw-r--r--gmp/mpn/x86/atom/dive_1.asm34
-rw-r--r--gmp/mpn/x86/atom/gmp-mparam.h201
-rw-r--r--gmp/mpn/x86/atom/logops_n.asm151
-rw-r--r--gmp/mpn/x86/atom/lshift.asm218
-rw-r--r--gmp/mpn/x86/atom/lshiftc.asm159
-rw-r--r--gmp/mpn/x86/atom/mmx/copyd.asm34
-rw-r--r--gmp/mpn/x86/atom/mmx/copyi.asm34
-rw-r--r--gmp/mpn/x86/atom/mmx/hamdist.asm34
-rw-r--r--gmp/mpn/x86/atom/mod_34lsub1.asm34
-rw-r--r--gmp/mpn/x86/atom/mode1o.asm34
-rw-r--r--gmp/mpn/x86/atom/rshift.asm152
-rw-r--r--gmp/mpn/x86/atom/sse2/aorsmul_1.asm174
-rw-r--r--gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/divrem_1.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/mod_1_1.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/mod_1_4.asm34
-rw-r--r--gmp/mpn/x86/atom/sse2/mul_1.asm124
-rw-r--r--gmp/mpn/x86/atom/sse2/mul_basecase.asm501
-rw-r--r--gmp/mpn/x86/atom/sse2/popcount.asm35
-rw-r--r--gmp/mpn/x86/atom/sse2/sqr_basecase.asm634
-rw-r--r--gmp/mpn/x86/atom/sublsh1_n.asm34
-rw-r--r--gmp/mpn/x86/atom/sublsh2_n.asm57
-rw-r--r--gmp/mpn/x86/bd1/gmp-mparam.h208
-rw-r--r--gmp/mpn/x86/bd2/gmp-mparam.h209
-rw-r--r--gmp/mpn/x86/bdiv_dbm1c.asm71
-rw-r--r--gmp/mpn/x86/bdiv_q_1.asm208
-rw-r--r--gmp/mpn/x86/bobcat/gmp-mparam.h197
-rw-r--r--gmp/mpn/x86/cnd_aors_n.asm124
-rw-r--r--gmp/mpn/x86/copyd.asm45
-rw-r--r--gmp/mpn/x86/copyi.asm45
-rw-r--r--gmp/mpn/x86/core2/gmp-mparam.h200
-rw-r--r--gmp/mpn/x86/coreihwl/gmp-mparam.h210
-rw-r--r--gmp/mpn/x86/coreinhm/gmp-mparam.h224
-rw-r--r--gmp/mpn/x86/coreisbr/gmp-mparam.h203
-rw-r--r--gmp/mpn/x86/darwin.m482
-rw-r--r--gmp/mpn/x86/dive_1.asm35
-rw-r--r--gmp/mpn/x86/divrem_1.asm36
-rw-r--r--gmp/mpn/x86/divrem_2.asm35
-rw-r--r--gmp/mpn/x86/fat/com.c32
-rw-r--r--gmp/mpn/x86/fat/diveby3.c21
-rw-r--r--gmp/mpn/x86/fat/fat.c228
-rw-r--r--gmp/mpn/x86/fat/fat_entry.asm37
-rw-r--r--gmp/mpn/x86/fat/gcd_1.c25
-rw-r--r--gmp/mpn/x86/fat/gmp-mparam.h42
-rw-r--r--gmp/mpn/x86/fat/lshiftc.c32
-rw-r--r--gmp/mpn/x86/fat/mod_1.c32
-rw-r--r--gmp/mpn/x86/fat/mod_1_1.c36
-rw-r--r--gmp/mpn/x86/fat/mod_1_2.c36
-rw-r--r--gmp/mpn/x86/fat/mod_1_4.c36
-rw-r--r--gmp/mpn/x86/fat/mode1o.c25
-rw-r--r--gmp/mpn/x86/fat/mullo_basecase.c32
-rw-r--r--gmp/mpn/x86/fat/redc_1.c32
-rw-r--r--gmp/mpn/x86/fat/redc_2.c32
-rw-r--r--gmp/mpn/x86/geode/gmp-mparam.h141
-rw-r--r--gmp/mpn/x86/gmp-mparam.h31
-rw-r--r--gmp/mpn/x86/i486/gmp-mparam.h37
-rw-r--r--gmp/mpn/x86/k10/gmp-mparam.h211
-rw-r--r--gmp/mpn/x86/k6/README25
-rw-r--r--gmp/mpn/x86/k6/aors_n.asm35
-rw-r--r--gmp/mpn/x86/k6/aorsmul_1.asm56
-rwxr-xr-xgmp/mpn/x86/k6/cross.pl33
-rw-r--r--gmp/mpn/x86/k6/divrem_1.asm36
-rw-r--r--gmp/mpn/x86/k6/gcd_1.asm35
-rw-r--r--gmp/mpn/x86/k6/gmp-mparam.h202
-rw-r--r--gmp/mpn/x86/k6/k62mmx/copyd.asm33
-rw-r--r--gmp/mpn/x86/k6/k62mmx/lshift.asm33
-rw-r--r--gmp/mpn/x86/k6/k62mmx/rshift.asm33
-rw-r--r--gmp/mpn/x86/k6/mmx/com_n.asm (renamed from gmp/mpn/x86/k6/mmx/com.asm)41
-rw-r--r--gmp/mpn/x86/k6/mmx/dive_1.asm37
-rw-r--r--gmp/mpn/x86/k6/mmx/logops_n.asm35
-rw-r--r--gmp/mpn/x86/k6/mmx/lshift.asm33
-rw-r--r--gmp/mpn/x86/k6/mmx/popham.asm35
-rw-r--r--gmp/mpn/x86/k6/mmx/rshift.asm33
-rw-r--r--gmp/mpn/x86/k6/mod_34lsub1.asm35
-rw-r--r--gmp/mpn/x86/k6/mode1o.asm37
-rw-r--r--gmp/mpn/x86/k6/mul_1.asm47
-rw-r--r--gmp/mpn/x86/k6/mul_basecase.asm35
-rw-r--r--gmp/mpn/x86/k6/pre_mod_1.asm33
-rw-r--r--gmp/mpn/x86/k6/sqr_basecase.asm57
-rw-r--r--gmp/mpn/x86/k7/README25
-rw-r--r--gmp/mpn/x86/k7/addlsh1_n.asm196
-rw-r--r--gmp/mpn/x86/k7/aors_n.asm35
-rw-r--r--gmp/mpn/x86/k7/aorsmul_1.asm50
-rw-r--r--gmp/mpn/x86/k7/bdiv_q_1.asm244
-rw-r--r--gmp/mpn/x86/k7/dive_1.asm35
-rw-r--r--gmp/mpn/x86/k7/gcd_1.asm481
-rw-r--r--gmp/mpn/x86/k7/gmp-mparam.h292
-rw-r--r--gmp/mpn/x86/k7/invert_limb.asm193
-rw-r--r--gmp/mpn/x86/k7/mmx/com_n.asm (renamed from gmp/mpn/x86/k7/mmx/com.asm)39
-rw-r--r--gmp/mpn/x86/k7/mmx/copyd.asm33
-rw-r--r--gmp/mpn/x86/k7/mmx/copyi.asm33
-rw-r--r--gmp/mpn/x86/k7/mmx/divrem_1.asm49
-rw-r--r--gmp/mpn/x86/k7/mmx/lshift.asm35
-rw-r--r--gmp/mpn/x86/k7/mmx/mod_1.asm509
-rw-r--r--gmp/mpn/x86/k7/mmx/popham.asm37
-rw-r--r--gmp/mpn/x86/k7/mmx/rshift.asm35
-rw-r--r--gmp/mpn/x86/k7/mod_1_1.asm221
-rw-r--r--gmp/mpn/x86/k7/mod_1_4.asm260
-rw-r--r--gmp/mpn/x86/k7/mod_34lsub1.asm36
-rw-r--r--gmp/mpn/x86/k7/mode1o.asm37
-rw-r--r--gmp/mpn/x86/k7/mul_1.asm46
-rw-r--r--gmp/mpn/x86/k7/mul_basecase.asm35
-rw-r--r--gmp/mpn/x86/k7/sqr_basecase.asm47
-rw-r--r--gmp/mpn/x86/k7/sublsh1_n.asm173
-rw-r--r--gmp/mpn/x86/k8/gmp-mparam.h198
-rw-r--r--gmp/mpn/x86/lshift.asm48
-rw-r--r--gmp/mpn/x86/mmx/sec_tabselect.asm163
-rw-r--r--gmp/mpn/x86/mod_1.asm163
-rw-r--r--gmp/mpn/x86/mod_34lsub1.asm45
-rw-r--r--gmp/mpn/x86/mul_1.asm56
-rw-r--r--gmp/mpn/x86/mul_basecase.asm46
-rw-r--r--gmp/mpn/x86/nano/gmp-mparam.h162
-rw-r--r--gmp/mpn/x86/p6/README27
-rw-r--r--gmp/mpn/x86/p6/aors_n.asm37
-rw-r--r--gmp/mpn/x86/p6/aorsmul_1.asm53
-rw-r--r--gmp/mpn/x86/p6/bdiv_q_1.asm286
-rw-r--r--gmp/mpn/x86/p6/copyd.asm33
-rw-r--r--gmp/mpn/x86/p6/dive_1.asm37
-rw-r--r--gmp/mpn/x86/p6/gcd_1.asm156
-rw-r--r--gmp/mpn/x86/p6/gmp-mparam.h240
-rw-r--r--gmp/mpn/x86/p6/lshsub_n.asm29
-rw-r--r--gmp/mpn/x86/p6/mmx/divrem_1.asm35
-rw-r--r--gmp/mpn/x86/p6/mmx/gmp-mparam.h255
-rw-r--r--gmp/mpn/x86/p6/mmx/lshift.asm33
-rw-r--r--gmp/mpn/x86/p6/mmx/popham.asm33
-rw-r--r--gmp/mpn/x86/p6/mmx/rshift.asm33
-rw-r--r--gmp/mpn/x86/p6/mod_1.asm472
-rw-r--r--gmp/mpn/x86/p6/mod_34lsub1.asm35
-rw-r--r--gmp/mpn/x86/p6/mode1o.asm39
-rw-r--r--gmp/mpn/x86/p6/mul_basecase.asm35
-rw-r--r--gmp/mpn/x86/p6/p3mmx/popham.asm33
-rw-r--r--gmp/mpn/x86/p6/sqr_basecase.asm45
-rw-r--r--gmp/mpn/x86/p6/sse2/addmul_1.asm33
-rw-r--r--gmp/mpn/x86/p6/sse2/gmp-mparam.h233
-rw-r--r--gmp/mpn/x86/p6/sse2/mod_1_1.asm34
-rw-r--r--gmp/mpn/x86/p6/sse2/mod_1_4.asm34
-rw-r--r--gmp/mpn/x86/p6/sse2/mul_1.asm33
-rw-r--r--gmp/mpn/x86/p6/sse2/mul_basecase.asm33
-rw-r--r--gmp/mpn/x86/p6/sse2/popcount.asm33
-rw-r--r--gmp/mpn/x86/p6/sse2/sqr_basecase.asm33
-rw-r--r--gmp/mpn/x86/p6/sse2/submul_1.asm33
-rw-r--r--gmp/mpn/x86/pentium/README27
-rw-r--r--gmp/mpn/x86/pentium/aors_n.asm40
-rw-r--r--gmp/mpn/x86/pentium/aorsmul_1.asm33
-rw-r--r--gmp/mpn/x86/pentium/bdiv_q_1.asm260
-rw-r--r--gmp/mpn/x86/pentium/com_n.asm (renamed from gmp/mpn/x86/pentium/com.asm)39
-rw-r--r--gmp/mpn/x86/pentium/copyd.asm33
-rw-r--r--gmp/mpn/x86/pentium/copyi.asm33
-rw-r--r--gmp/mpn/x86/pentium/dive_1.asm35
-rw-r--r--gmp/mpn/x86/pentium/gmp-mparam.h38
-rw-r--r--gmp/mpn/x86/pentium/hamdist.asm33
-rw-r--r--gmp/mpn/x86/pentium/logops_n.asm33
-rw-r--r--gmp/mpn/x86/pentium/lshift.asm36
-rw-r--r--gmp/mpn/x86/pentium/mmx/gmp-mparam.h194
-rw-r--r--gmp/mpn/x86/pentium/mmx/hamdist.asm33
-rw-r--r--gmp/mpn/x86/pentium/mmx/lshift.asm35
-rw-r--r--gmp/mpn/x86/pentium/mmx/mul_1.asm35
-rw-r--r--gmp/mpn/x86/pentium/mmx/rshift.asm33
-rw-r--r--gmp/mpn/x86/pentium/mod_1.asm454
-rw-r--r--gmp/mpn/x86/pentium/mod_34lsub1.asm35
-rw-r--r--gmp/mpn/x86/pentium/mode1o.asm37
-rw-r--r--gmp/mpn/x86/pentium/mul_1.asm33
-rw-r--r--gmp/mpn/x86/pentium/mul_2.asm33
-rw-r--r--gmp/mpn/x86/pentium/mul_basecase.asm35
-rw-r--r--gmp/mpn/x86/pentium/popcount.asm33
-rw-r--r--gmp/mpn/x86/pentium/rshift.asm36
-rw-r--r--gmp/mpn/x86/pentium/sqr_basecase.asm35
-rw-r--r--gmp/mpn/x86/pentium4/README25
-rw-r--r--gmp/mpn/x86/pentium4/copyd.asm36
-rw-r--r--gmp/mpn/x86/pentium4/copyi.asm36
-rw-r--r--gmp/mpn/x86/pentium4/mmx/lshift.asm33
-rw-r--r--gmp/mpn/x86/pentium4/mmx/popham.asm35
-rw-r--r--gmp/mpn/x86/pentium4/mmx/rshift.asm33
-rw-r--r--gmp/mpn/x86/pentium4/sse2/add_n.asm79
-rw-r--r--gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm76
-rw-r--r--gmp/mpn/x86/pentium4/sse2/addmul_1.asm64
-rw-r--r--gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm141
-rw-r--r--gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm233
-rw-r--r--gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm95
-rw-r--r--gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm114
-rw-r--r--gmp/mpn/x86/pentium4/sse2/dive_1.asm49
-rw-r--r--gmp/mpn/x86/pentium4/sse2/divrem_1.asm36
-rw-r--r--gmp/mpn/x86/pentium4/sse2/gmp-mparam.h252
-rw-r--r--gmp/mpn/x86/pentium4/sse2/mod_1.asm391
-rw-r--r--gmp/mpn/x86/pentium4/sse2/mod_1_1.asm166
-rw-r--r--gmp/mpn/x86/pentium4/sse2/mod_1_4.asm269
-rw-r--r--gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm35
-rw-r--r--gmp/mpn/x86/pentium4/sse2/mode1o.asm49
-rw-r--r--gmp/mpn/x86/pentium4/sse2/mul_1.asm64
-rw-r--r--gmp/mpn/x86/pentium4/sse2/mul_basecase.asm27
-rw-r--r--gmp/mpn/x86/pentium4/sse2/popcount.asm76
-rw-r--r--gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm35
-rw-r--r--gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm29
-rw-r--r--gmp/mpn/x86/pentium4/sse2/sub_n.asm82
-rw-r--r--gmp/mpn/x86/pentium4/sse2/submul_1.asm176
-rw-r--r--gmp/mpn/x86/rshift.asm48
-rw-r--r--gmp/mpn/x86/sec_tabselect.asm115
-rw-r--r--gmp/mpn/x86/sqr_basecase.asm37
-rwxr-xr-xgmp/mpn/x86/t-zdisp.sh33
-rwxr-xr-xgmp/mpn/x86/t-zdisp2.pl41
-rw-r--r--gmp/mpn/x86/udiv.asm33
-rw-r--r--gmp/mpn/x86/umul.asm33
-rw-r--r--gmp/mpn/x86/x86-defs.m487
-rw-r--r--gmp/mpn/x86_64/README25
-rw-r--r--gmp/mpn/x86_64/addaddmul_1msb0.asm53
-rw-r--r--gmp/mpn/x86_64/addlsh1_n.asm (renamed from gmp/mpn/x86_64/aorrlsh1_n.asm)97
-rw-r--r--gmp/mpn/x86_64/addmul_2.asm175
-rw-r--r--gmp/mpn/x86_64/aorrlsh2_n.asm53
-rw-r--r--gmp/mpn/x86_64/aorrlshC_n.asm160
-rw-r--r--gmp/mpn/x86_64/aorrlsh_n.asm159
-rw-r--r--gmp/mpn/x86_64/aors_err1_n.asm225
-rw-r--r--gmp/mpn/x86_64/aors_err2_n.asm172
-rw-r--r--gmp/mpn/x86_64/aors_err3_n.asm156
-rw-r--r--gmp/mpn/x86_64/aors_n.asm149
-rw-r--r--gmp/mpn/x86_64/aorsmul_1.asm99
-rw-r--r--gmp/mpn/x86_64/atom/addmul_2.asm186
-rw-r--r--gmp/mpn/x86_64/atom/aorrlsh1_n.asm238
-rw-r--r--gmp/mpn/x86_64/atom/aorrlsh2_n.asm191
-rw-r--r--gmp/mpn/x86_64/atom/aors_n.asm155
-rw-r--r--gmp/mpn/x86_64/atom/aorsmul_1.asm190
-rw-r--r--gmp/mpn/x86_64/atom/com.asm37
-rw-r--r--gmp/mpn/x86_64/atom/copyd.asm37
-rw-r--r--gmp/mpn/x86_64/atom/copyi.asm37
-rw-r--r--gmp/mpn/x86_64/atom/dive_1.asm37
-rw-r--r--gmp/mpn/x86_64/atom/gmp-mparam.h276
-rw-r--r--gmp/mpn/x86_64/atom/lshift.asm123
-rw-r--r--gmp/mpn/x86_64/atom/lshiftc.asm127
-rw-r--r--gmp/mpn/x86_64/atom/mul_1.asm143
-rw-r--r--gmp/mpn/x86_64/atom/mul_2.asm186
-rw-r--r--gmp/mpn/x86_64/atom/popcount.asm35
-rw-r--r--gmp/mpn/x86_64/atom/redc_1.asm574
-rw-r--r--gmp/mpn/x86_64/atom/rsh1aors_n.asm287
-rw-r--r--gmp/mpn/x86_64/atom/rshift.asm121
-rw-r--r--gmp/mpn/x86_64/atom/sublsh1_n.asm242
-rw-r--r--gmp/mpn/x86_64/bd1/README11
-rw-r--r--gmp/mpn/x86_64/bd1/aorrlsh1_n.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/aorsmul_1.asm181
-rw-r--r--gmp/mpn/x86_64/bd1/com.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/copyd.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/copyi.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/gcd_1.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/gmp-mparam.h236
-rw-r--r--gmp/mpn/x86_64/bd1/hamdist.asm38
-rw-r--r--gmp/mpn/x86_64/bd1/mul_1.asm184
-rw-r--r--gmp/mpn/x86_64/bd1/mul_2.asm192
-rw-r--r--gmp/mpn/x86_64/bd1/mul_basecase.asm416
-rw-r--r--gmp/mpn/x86_64/bd1/popcount.asm38
-rw-r--r--gmp/mpn/x86_64/bd1/sec_tabselect.asm37
-rw-r--r--gmp/mpn/x86_64/bd1/sublsh1_n.asm37
-rw-r--r--gmp/mpn/x86_64/bd2/gmp-mparam.h237
-rw-r--r--gmp/mpn/x86_64/bdiv_dbm1c.asm146
-rw-r--r--gmp/mpn/x86_64/bdiv_q_1.asm167
-rw-r--r--gmp/mpn/x86_64/bobcat/aors_n.asm150
-rw-r--r--gmp/mpn/x86_64/bobcat/aorsmul_1.asm183
-rw-r--r--gmp/mpn/x86_64/bobcat/copyd.asm91
-rw-r--r--gmp/mpn/x86_64/bobcat/copyi.asm94
-rw-r--r--gmp/mpn/x86_64/bobcat/gmp-mparam.h208
-rw-r--r--gmp/mpn/x86_64/bobcat/mul_1.asm187
-rw-r--r--gmp/mpn/x86_64/bobcat/mul_basecase.asm486
-rw-r--r--gmp/mpn/x86_64/bobcat/redc_1.asm502
-rw-r--r--gmp/mpn/x86_64/bobcat/sqr_basecase.asm565
-rw-r--r--gmp/mpn/x86_64/cnd_aors_n.asm183
-rw-r--r--gmp/mpn/x86_64/com.asm95
-rw-r--r--gmp/mpn/x86_64/com_n.asm77
-rw-r--r--gmp/mpn/x86_64/copyd.asm119
-rw-r--r--gmp/mpn/x86_64/copyi.asm117
-rw-r--r--gmp/mpn/x86_64/core2/aorrlsh1_n.asm53
-rw-r--r--gmp/mpn/x86_64/core2/aorrlsh2_n.asm53
-rw-r--r--gmp/mpn/x86_64/core2/aorrlsh_n.asm38
-rw-r--r--gmp/mpn/x86_64/core2/aors_err1_n.asm225
-rw-r--r--gmp/mpn/x86_64/core2/aors_n.asm130
-rw-r--r--gmp/mpn/x86_64/core2/aorslsh1_n.asm (renamed from gmp/mpn/x86_64/core2/sublshC_n.asm)85
-rw-r--r--gmp/mpn/x86_64/core2/aorsmul_1.asm193
-rw-r--r--gmp/mpn/x86_64/core2/copyd.asm37
-rw-r--r--gmp/mpn/x86_64/core2/copyi.asm37
-rw-r--r--gmp/mpn/x86_64/core2/divrem_1.asm237
-rw-r--r--gmp/mpn/x86_64/core2/gcd_1.asm144
-rw-r--r--gmp/mpn/x86_64/core2/gmp-mparam.h275
-rw-r--r--gmp/mpn/x86_64/core2/lshift.asm100
-rw-r--r--gmp/mpn/x86_64/core2/lshiftc.asm159
-rw-r--r--gmp/mpn/x86_64/core2/mul_basecase.asm975
-rw-r--r--gmp/mpn/x86_64/core2/mullo_basecase.asm427
-rw-r--r--gmp/mpn/x86_64/core2/popcount.asm32
-rw-r--r--gmp/mpn/x86_64/core2/redc_1.asm425
-rw-r--r--gmp/mpn/x86_64/core2/rsh1aors_n.asm169
-rw-r--r--gmp/mpn/x86_64/core2/rshift.asm100
-rw-r--r--gmp/mpn/x86_64/core2/sec_tabselect.asm37
-rw-r--r--gmp/mpn/x86_64/core2/sqr_basecase.asm984
-rw-r--r--gmp/mpn/x86_64/core2/sublsh1_n.asm47
-rw-r--r--gmp/mpn/x86_64/core2/sublsh2_n.asm47
-rw-r--r--gmp/mpn/x86_64/coreihwl/addmul_2.asm238
-rw-r--r--gmp/mpn/x86_64/coreihwl/aorsmul_1.asm198
-rw-r--r--gmp/mpn/x86_64/coreihwl/gmp-mparam.h237
-rw-r--r--gmp/mpn/x86_64/coreihwl/mul_1.asm155
-rw-r--r--gmp/mpn/x86_64/coreihwl/mul_2.asm173
-rw-r--r--gmp/mpn/x86_64/coreihwl/mul_basecase.asm441
-rw-r--r--gmp/mpn/x86_64/coreihwl/mullo_basecase.asm426
-rw-r--r--gmp/mpn/x86_64/coreihwl/redc_1.asm433
-rw-r--r--gmp/mpn/x86_64/coreihwl/sqr_basecase.asm506
-rw-r--r--gmp/mpn/x86_64/coreinhm/aorrlsh_n.asm200
-rw-r--r--gmp/mpn/x86_64/coreinhm/aorsmul_1.asm187
-rw-r--r--gmp/mpn/x86_64/coreinhm/gmp-mparam.h231
-rw-r--r--gmp/mpn/x86_64/coreinhm/hamdist.asm38
-rw-r--r--gmp/mpn/x86_64/coreinhm/popcount.asm38
-rw-r--r--gmp/mpn/x86_64/coreinhm/redc_1.asm544
-rw-r--r--gmp/mpn/x86_64/coreinhm/sec_tabselect.asm37
-rw-r--r--gmp/mpn/x86_64/coreisbr/addmul_2.asm224
-rw-r--r--gmp/mpn/x86_64/coreisbr/aorrlsh1_n.asm54
-rw-r--r--gmp/mpn/x86_64/coreisbr/aorrlsh2_n.asm56
-rw-r--r--gmp/mpn/x86_64/coreisbr/aorrlshC_n.asm173
-rw-r--r--gmp/mpn/x86_64/coreisbr/aorrlsh_n.asm215
-rw-r--r--gmp/mpn/x86_64/coreisbr/aors_n.asm198
-rw-r--r--gmp/mpn/x86_64/coreisbr/aorsmul_1.asm209
-rw-r--r--gmp/mpn/x86_64/coreisbr/divrem_1.asm37
-rw-r--r--gmp/mpn/x86_64/coreisbr/gmp-mparam.h224
-rw-r--r--gmp/mpn/x86_64/coreisbr/lshift.asm37
-rw-r--r--gmp/mpn/x86_64/coreisbr/lshiftc.asm37
-rw-r--r--gmp/mpn/x86_64/coreisbr/mul_1.asm161
-rw-r--r--gmp/mpn/x86_64/coreisbr/mul_2.asm163
-rw-r--r--gmp/mpn/x86_64/coreisbr/mul_basecase.asm407
-rw-r--r--gmp/mpn/x86_64/coreisbr/mullo_basecase.asm384
-rw-r--r--gmp/mpn/x86_64/coreisbr/popcount.asm118
-rw-r--r--gmp/mpn/x86_64/coreisbr/redc_1.asm541
-rw-r--r--gmp/mpn/x86_64/coreisbr/rsh1aors_n.asm193
-rw-r--r--gmp/mpn/x86_64/coreisbr/rshift.asm37
-rw-r--r--gmp/mpn/x86_64/coreisbr/sec_tabselect.asm37
-rw-r--r--gmp/mpn/x86_64/coreisbr/sqr_basecase.asm484
-rw-r--r--gmp/mpn/x86_64/darwin.m475
-rw-r--r--gmp/mpn/x86_64/div_qr_1n_pi1.asm247
-rw-r--r--gmp/mpn/x86_64/div_qr_2n_pi1.asm158
-rw-r--r--gmp/mpn/x86_64/div_qr_2u_pi1.asm200
-rw-r--r--gmp/mpn/x86_64/dive_1.asm197
-rw-r--r--gmp/mpn/x86_64/divrem_1.asm233
-rw-r--r--gmp/mpn/x86_64/divrem_2.asm288
-rw-r--r--gmp/mpn/x86_64/dos64.m4100
-rw-r--r--gmp/mpn/x86_64/fastavx/copyd.asm171
-rw-r--r--gmp/mpn/x86_64/fastavx/copyi.asm168
-rw-r--r--gmp/mpn/x86_64/fastsse/README21
-rw-r--r--gmp/mpn/x86_64/fastsse/com-palignr.asm302
-rw-r--r--gmp/mpn/x86_64/fastsse/com.asm161
-rw-r--r--gmp/mpn/x86_64/fastsse/copyd-palignr.asm251
-rw-r--r--gmp/mpn/x86_64/fastsse/copyd.asm145
-rw-r--r--gmp/mpn/x86_64/fastsse/copyi-palignr.asm295
-rw-r--r--gmp/mpn/x86_64/fastsse/copyi.asm166
-rw-r--r--gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm182
-rw-r--r--gmp/mpn/x86_64/fastsse/lshift.asm169
-rw-r--r--gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm193
-rw-r--r--gmp/mpn/x86_64/fastsse/lshiftc.asm179
-rw-r--r--gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm201
-rw-r--r--gmp/mpn/x86_64/fastsse/sec_tabselect.asm192
-rw-r--r--gmp/mpn/x86_64/fat/fat.c368
-rw-r--r--gmp/mpn/x86_64/fat/fat_entry.asm204
-rw-r--r--gmp/mpn/x86_64/fat/gmp-mparam.h72
-rw-r--r--gmp/mpn/x86_64/fat/mod_1.c32
-rw-r--r--gmp/mpn/x86_64/fat/mul_basecase.c32
-rw-r--r--gmp/mpn/x86_64/fat/mullo_basecase.c32
-rw-r--r--gmp/mpn/x86_64/fat/redc_1.c32
-rw-r--r--gmp/mpn/x86_64/fat/redc_2.c32
-rw-r--r--gmp/mpn/x86_64/fat/sqr_basecase.c32
-rw-r--r--gmp/mpn/x86_64/gcd_1.asm163
-rw-r--r--gmp/mpn/x86_64/gmp-mparam.h275
-rw-r--r--gmp/mpn/x86_64/invert_limb.asm178
-rw-r--r--gmp/mpn/x86_64/invert_limb_table.asm50
-rw-r--r--gmp/mpn/x86_64/k10/gcd_1.asm37
-rw-r--r--gmp/mpn/x86_64/k10/gmp-mparam.h222
-rw-r--r--gmp/mpn/x86_64/k10/hamdist.asm103
-rw-r--r--gmp/mpn/x86_64/k10/lshift.asm37
-rw-r--r--gmp/mpn/x86_64/k10/lshiftc.asm37
-rw-r--r--gmp/mpn/x86_64/k10/popcount.asm138
-rw-r--r--gmp/mpn/x86_64/k10/rshift.asm37
-rw-r--r--gmp/mpn/x86_64/k10/sec_tabselect.asm37
-rw-r--r--gmp/mpn/x86_64/k8/aorrlsh_n.asm217
-rw-r--r--gmp/mpn/x86_64/k8/div_qr_1n_pi1.asm249
-rw-r--r--gmp/mpn/x86_64/k8/gmp-mparam.h236
-rw-r--r--gmp/mpn/x86_64/k8/mullo_basecase.asm436
-rw-r--r--gmp/mpn/x86_64/k8/mulmid_basecase.asm559
-rw-r--r--gmp/mpn/x86_64/k8/redc_1.asm590
-rw-r--r--gmp/mpn/x86_64/logops_n.asm77
-rw-r--r--gmp/mpn/x86_64/lshift.asm153
-rw-r--r--gmp/mpn/x86_64/lshiftc.asm182
-rw-r--r--gmp/mpn/x86_64/lshsub_n.asm106
-rw-r--r--gmp/mpn/x86_64/missing-call.m453
-rw-r--r--gmp/mpn/x86_64/missing-inline.m4100
-rw-r--r--gmp/mpn/x86_64/missing.asm130
-rw-r--r--gmp/mpn/x86_64/mod_1_1.asm235
-rw-r--r--gmp/mpn/x86_64/mod_1_2.asm238
-rw-r--r--gmp/mpn/x86_64/mod_1_4.asm269
-rw-r--r--gmp/mpn/x86_64/mod_34lsub1.asm230
-rw-r--r--gmp/mpn/x86_64/mode1o.asm193
-rw-r--r--gmp/mpn/x86_64/mul_1.asm105
-rw-r--r--gmp/mpn/x86_64/mul_2.asm49
-rw-r--r--gmp/mpn/x86_64/mul_basecase.asm (renamed from gmp/mpn/x86_64/k8/mul_basecase.asm)68
-rw-r--r--gmp/mpn/x86_64/mulx/adx/addmul_1.asm149
-rw-r--r--gmp/mpn/x86_64/mulx/aorsmul_1.asm161
-rw-r--r--gmp/mpn/x86_64/mulx/mul_1.asm154
-rw-r--r--gmp/mpn/x86_64/nano/copyd.asm37
-rw-r--r--gmp/mpn/x86_64/nano/copyi.asm37
-rw-r--r--gmp/mpn/x86_64/nano/dive_1.asm166
-rw-r--r--gmp/mpn/x86_64/nano/gcd_1.asm37
-rw-r--r--gmp/mpn/x86_64/nano/gmp-mparam.h243
-rw-r--r--gmp/mpn/x86_64/nano/popcount.asm35
-rw-r--r--gmp/mpn/x86_64/pentium4/aors_n.asm64
-rw-r--r--gmp/mpn/x86_64/pentium4/aorslsh1_n.asm212
-rw-r--r--gmp/mpn/x86_64/pentium4/aorslsh2_n.asm50
-rw-r--r--gmp/mpn/x86_64/pentium4/aorslshC_n.asm203
-rw-r--r--gmp/mpn/x86_64/pentium4/gmp-mparam.h282
-rw-r--r--gmp/mpn/x86_64/pentium4/lshift.asm78
-rw-r--r--gmp/mpn/x86_64/pentium4/lshiftc.asm179
-rw-r--r--gmp/mpn/x86_64/pentium4/mod_34lsub1.asm167
-rw-r--r--gmp/mpn/x86_64/pentium4/popcount.asm32
-rw-r--r--gmp/mpn/x86_64/pentium4/rsh1aors_n.asm334
-rw-r--r--gmp/mpn/x86_64/pentium4/rshift.asm78
-rw-r--r--gmp/mpn/x86_64/pentium4/sec_tabselect.asm37
-rw-r--r--gmp/mpn/x86_64/popham.asm208
-rw-r--r--gmp/mpn/x86_64/redc_1.asm335
-rw-r--r--gmp/mpn/x86_64/rsh1add_n.asm146
-rw-r--r--gmp/mpn/x86_64/rsh1aors_n.asm189
-rw-r--r--gmp/mpn/x86_64/rsh1sub_n.asm146
-rw-r--r--gmp/mpn/x86_64/rshift.asm128
-rw-r--r--gmp/mpn/x86_64/sec_tabselect.asm176
-rw-r--r--gmp/mpn/x86_64/sqr_basecase.asm (renamed from gmp/mpn/x86_64/k8/sqr_basecase.asm)442
-rw-r--r--gmp/mpn/x86_64/sqr_diag_addlsh1.asm116
-rw-r--r--gmp/mpn/x86_64/sublsh1_n.asm54
-rw-r--r--gmp/mpn/x86_64/x86_64-defs.m4245
-rw-r--r--gmp/mpn/z8000/README45
-rw-r--r--gmp/mpn/z8000/add_n.s51
-rw-r--r--gmp/mpn/z8000/gmp-mparam.h21
-rw-r--r--gmp/mpn/z8000/mul_1.s66
-rw-r--r--gmp/mpn/z8000/sub_n.s52
-rw-r--r--gmp/mpn/z8000x/add_n.s54
-rw-r--r--gmp/mpn/z8000x/sub_n.s54
1227 files changed, 30206 insertions, 116133 deletions
diff --git a/gmp/mpn/Makeasm.am b/gmp/mpn/Makeasm.am
index 5d7306c221..bb66700384 100644
--- a/gmp/mpn/Makeasm.am
+++ b/gmp/mpn/Makeasm.am
@@ -1,32 +1,22 @@
## Automake asm file rules.
-# Copyright 1996, 1998-2002 Free Software Foundation, Inc.
+# Copyright 1996, 1998, 1999, 2000, 2001, 2002 Free Software Foundation,
+# Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# COMPILE minus CC.
diff --git a/gmp/mpn/Makefile.am b/gmp/mpn/Makefile.am
index 20b8a4a116..073b89e988 100644
--- a/gmp/mpn/Makefile.am
+++ b/gmp/mpn/Makefile.am
@@ -1,32 +1,22 @@
## Process this file with automake to generate Makefile.in
-# Copyright 1996, 1998-2002, 2005, 2011, 2013 Free Software Foundation, Inc.
+# Copyright 1996, 1998, 1999, 2000, 2001, 2002, 2005 Free Software Foundation,
+# Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
INCLUDES = -D__GMP_WITHIN_GMP -I$(top_srcdir) \
@@ -34,14 +24,42 @@ INCLUDES = -D__GMP_WITHIN_GMP -I$(top_srcdir) \
OFILES = @mpn_objects@
+
+# All possible mpn normal and optional function files are listed here, to
+# get automake to generate ansi2knr rules for each. Such rules will be
+# ignored for any that are instead implemented with a .asm (or whatever) for
+# a particular target.
+#
+nodist_EXTRA_libmpn_la_SOURCES = \
+ add.c add_1.c add_n.c \
+ addmul_1.c addmul_2.c addmul_3.c addmul_4.c addmul_5.c addmul_6.c \
+ addmul_7.c addmul_8.c \
+ and_n.c andn_n.c bdivmod.c \
+ cmp.c com_n.c copyd.c copyi.c \
+ dc_divrem_n.c dive_1.c diveby3.c divis.c divrem.c divrem_1.c divrem_2.c \
+ dump.c fib2_ui.c gcd.c \
+ gcd_1.c gcdext.c get_d.c get_str.c \
+ hamdist.c hgcd2.c hgcd.c invert_limb.c \
+ ior_n.c iorn_n.c jacbase.c lshift.c \
+ matrix22_mul.c mod_1.c mod_34lsub1.c mode1o.c \
+ mod_1_1.c mod_1_2.c mod_1_3.c mod_1_4.c \
+ mul.c mul_1.c mul_2.c mul_3.c mul_4.c mul_fft.c mul_n.c mul_basecase.c \
+ mul_toom22.c mul_toom32.c mul_toom42.c \
+ mullow_n.c mullow_basecase.c nand_n.c neg_n.c nior_n.c perfsqr.c \
+ popcount.c pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
+ rootrem.c sb_divrem_mn.c scan0.c scan1.c set_str.c \
+ sqr_basecase.c sqr_diagonal.c \
+ sqrtrem.c sub.c sub_1.c sub_n.c submul_1.c \
+ tdiv_qr.c udiv_qrnnd.c udiv_w_sdiv.c xor_n.c xnor_n.c
+
noinst_LTLIBRARIES = libmpn.la
nodist_libmpn_la_SOURCES = fib_table.c mp_bases.c
libmpn_la_LIBADD = $(OFILES)
libmpn_la_DEPENDENCIES = $(OFILES)
-TARG_DIST = alpha arm arm64 cray generic ia64 lisp m68k m88k \
- minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \
- s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64
+TARG_DIST = a29k alpha arm clipper cray generic i960 ia64 lisp m68k m88k \
+ minithres mips32 mips64 ns32k pa32 pa64 power powerpc32 powerpc64 pyr s390 \
+ sh sparc32 sparc64 thumb vax x86 x86_64 z8000 z8000x
EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST)
@@ -56,4 +74,7 @@ mp_bases.c:
perfsqr.h:
cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h
+tune-gcd-p: gcd.c
+ $(COMPILE) -g -O1 -I $(top_srcdir)/tune -DTUNE_GCD_P=1 gcd.c -o tune-gcd-p -L ../.libs -L../tune/.libs -lspeed -lgmp -lm
+
include Makeasm.am
diff --git a/gmp/mpn/Makefile.in b/gmp/mpn/Makefile.in
index 099abf26ab..e9817864fc 100644
--- a/gmp/mpn/Makefile.in
+++ b/gmp/mpn/Makefile.in
@@ -1,9 +1,8 @@
-# Makefile.in generated by automake 1.11.6 from Makefile.am.
+# Makefile.in generated by automake 1.8.4 from Makefile.am.
# @configure_input@
# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
-# Foundation, Inc.
+# 2003, 2004 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
@@ -15,85 +14,53 @@
@SET_MAKE@
-# Copyright 1996, 1998-2002, 2005, 2011, 2013 Free Software Foundation, Inc.
+# Copyright 1996, 1998, 1999, 2000, 2001, 2002, 2005 Free Software Foundation,
+# Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-# Copyright 1996, 1998-2002 Free Software Foundation, Inc.
-#
-# This file is part of the GNU MP Library.
-#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# Copyright 1996, 1998, 1999, 2000, 2001, 2002 Free Software Foundation,
+# Inc.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# This file is part of the GNU MP Library.
#
-# or
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+SOURCES = $(nodist_libmpn_la_SOURCES) $(nodist_EXTRA_libmpn_la_SOURCES)
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
VPATH = @srcdir@
-am__make_dryrun = \
- { \
- am__dry=no; \
- case $$MAKEFLAGS in \
- *\\[\ \ ]*) \
- echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \
- | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
- *) \
- for am__flg in $$MAKEFLAGS; do \
- case $$am__flg in \
- *=*|--*) ;; \
- *n*) am__dry=yes; break;; \
- esac; \
- done;; \
- esac; \
- test $$am__dry = yes; \
- }
pkgdatadir = $(datadir)/@PACKAGE@
-pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
-pkglibexecdir = $(libexecdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+top_builddir = ..
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+INSTALL = @INSTALL@
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
@@ -105,43 +72,37 @@ POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
-build_triplet = @build@
host_triplet = @host@
+ANSI2KNR = $(top_builddir)/ansi2knr
DIST_COMMON = README $(srcdir)/Makeasm.am $(srcdir)/Makefile.am \
$(srcdir)/Makefile.in
subdir = mpn
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \
- $(top_srcdir)/configure.ac
+ $(top_srcdir)/configure.in
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
-mkinstalldirs = $(install_sh) -d
+mkinstalldirs = $(mkdir_p)
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
am__DEPENDENCIES_1 =
-nodist_libmpn_la_OBJECTS = fib_table.lo mp_bases.lo
+nodist_libmpn_la_OBJECTS = fib_table$U.lo mp_bases$U.lo
libmpn_la_OBJECTS = $(nodist_libmpn_la_OBJECTS)
-DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)
depcomp =
am__depfiles_maybe =
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
- --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
- $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CFLAGS) $(CFLAGS)
CCLD = $(CC)
-LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
- --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
- $(LDFLAGS) -o $@
-SOURCES = $(nodist_libmpn_la_SOURCES)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(AM_LDFLAGS) $(LDFLAGS) -o $@
+SOURCES = $(nodist_libmpn_la_SOURCES) \
+ $(nodist_EXTRA_libmpn_la_SOURCES)
DIST_SOURCES =
-am__can_run_installinfo = \
- case $$AM_UPDATE_INFO_DIR in \
- n|no|NO) false;; \
- *) (install-info --version) >/dev/null 2>&1;; \
- esac
ETAGS = etags
CTAGS = ctags
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
@@ -155,6 +116,7 @@ AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AWK = @AWK@
+BITS_PER_MP_LIMB = @BITS_PER_MP_LIMB@
CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
CC = @CC@
CCAS = @CCAS@
@@ -170,17 +132,16 @@ CYGPATH_W = @CYGPATH_W@
DEFN_LONG_LONG_LIMB = @DEFN_LONG_LONG_LIMB@
DEFS = @DEFS@
DLLTOOL = @DLLTOOL@
-DSYMUTIL = @DSYMUTIL@
-DUMPBIN = @DUMPBIN@
+ECHO = @ECHO@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
+ENABLE_STATIC_FALSE = @ENABLE_STATIC_FALSE@
+ENABLE_STATIC_TRUE = @ENABLE_STATIC_TRUE@
EXEEXT = @EXEEXT@
EXEEXT_FOR_BUILD = @EXEEXT_FOR_BUILD@
-FGREP = @FGREP@
GMP_LDFLAGS = @GMP_LDFLAGS@
-GMP_LIMB_BITS = @GMP_LIMB_BITS@
GMP_NAIL_BITS = @GMP_NAIL_BITS@
GREP = @GREP@
HAVE_CLOCK_01 = @HAVE_CLOCK_01@
@@ -194,12 +155,10 @@ HAVE_SIGALTSTACK_01 = @HAVE_SIGALTSTACK_01@
HAVE_SIGSTACK_01 = @HAVE_SIGSTACK_01@
HAVE_STACK_T_01 = @HAVE_STACK_T_01@
HAVE_SYS_RESOURCE_H_01 = @HAVE_SYS_RESOURCE_H_01@
-INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
-LD = @LD@
LDFLAGS = @LDFLAGS@
LEX = @LEX@
LEXLIB = @LEXLIB@
@@ -214,26 +173,20 @@ LIBOBJS = @LIBOBJS@
LIBREADLINE = @LIBREADLINE@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
-LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
M4 = @M4@
MAINT = @MAINT@
+MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@
+MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@
MAKEINFO = @MAKEINFO@
-MANIFEST_TOOL = @MANIFEST_TOOL@
-MKDIR_P = @MKDIR_P@
-NM = @NM@
-NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
-OTOOL = @OTOOL@
-OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
-PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
RANLIB = @RANLIB@
@@ -243,31 +196,26 @@ SHELL = @SHELL@
SPEED_CYCLECOUNTER_OBJ = @SPEED_CYCLECOUNTER_OBJ@
STRIP = @STRIP@
TAL_OBJECT = @TAL_OBJECT@
-TUNE_LIBS = @TUNE_LIBS@
TUNE_SQR_OBJ = @TUNE_SQR_OBJ@
+U = @U@
U_FOR_BUILD = @U_FOR_BUILD@
VERSION = @VERSION@
+WANT_CXX_FALSE = @WANT_CXX_FALSE@
+WANT_CXX_TRUE = @WANT_CXX_TRUE@
+WANT_MPBSD_FALSE = @WANT_MPBSD_FALSE@
+WANT_MPBSD_TRUE = @WANT_MPBSD_TRUE@
WITH_READLINE_01 = @WITH_READLINE_01@
YACC = @YACC@
YFLAGS = @YFLAGS@
-abs_builddir = @abs_builddir@
-abs_srcdir = @abs_srcdir@
-abs_top_builddir = @abs_top_builddir@
-abs_top_srcdir = @abs_top_srcdir@
-ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_CXX = @ac_ct_CXX@
-ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
am__leading_dot = @am__leading_dot@
-am__tar = @am__tar@
-am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
-builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
@@ -291,6 +239,7 @@ mandir = @mandir@
mkdir_p = @mkdir_p@
mpn_objects = @mpn_objects@
mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+mpn_objs_in_libmp = @mpn_objs_in_libmp@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
@@ -298,23 +247,47 @@ program_transform_name = @program_transform_name@
psdir = @psdir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
-srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
-top_build_prefix = @top_build_prefix@
-top_builddir = @top_builddir@
-top_srcdir = @top_srcdir@
INCLUDES = -D__GMP_WITHIN_GMP -I$(top_srcdir) \
-DOPERATION_`echo $* | sed 's/_$$//'`
OFILES = @mpn_objects@
+
+# All possible mpn normal and optional function files are listed here, to
+# get automake to generate ansi2knr rules for each. Such rules will be
+# ignored for any that are instead implemented with a .asm (or whatever) for
+# a particular target.
+#
+nodist_EXTRA_libmpn_la_SOURCES = \
+ add.c add_1.c add_n.c \
+ addmul_1.c addmul_2.c addmul_3.c addmul_4.c addmul_5.c addmul_6.c \
+ addmul_7.c addmul_8.c \
+ and_n.c andn_n.c bdivmod.c \
+ cmp.c com_n.c copyd.c copyi.c \
+ dc_divrem_n.c dive_1.c diveby3.c divis.c divrem.c divrem_1.c divrem_2.c \
+ dump.c fib2_ui.c gcd.c \
+ gcd_1.c gcdext.c get_d.c get_str.c \
+ hamdist.c hgcd2.c hgcd.c invert_limb.c \
+ ior_n.c iorn_n.c jacbase.c lshift.c \
+ matrix22_mul.c mod_1.c mod_34lsub1.c mode1o.c \
+ mod_1_1.c mod_1_2.c mod_1_3.c mod_1_4.c \
+ mul.c mul_1.c mul_2.c mul_3.c mul_4.c mul_fft.c mul_n.c mul_basecase.c \
+ mul_toom22.c mul_toom32.c mul_toom42.c \
+ mullow_n.c mullow_basecase.c nand_n.c neg_n.c nior_n.c perfsqr.c \
+ popcount.c pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
+ rootrem.c sb_divrem_mn.c scan0.c scan1.c set_str.c \
+ sqr_basecase.c sqr_diagonal.c \
+ sqrtrem.c sub.c sub_1.c sub_n.c submul_1.c \
+ tdiv_qr.c udiv_qrnnd.c udiv_w_sdiv.c xor_n.c xnor_n.c
+
noinst_LTLIBRARIES = libmpn.la
nodist_libmpn_la_SOURCES = fib_table.c mp_bases.c
libmpn_la_LIBADD = $(OFILES)
libmpn_la_DEPENDENCIES = $(OFILES)
-TARG_DIST = alpha arm arm64 cray generic ia64 lisp m68k m88k \
- minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \
- s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64
+TARG_DIST = a29k alpha arm clipper cray generic i960 ia64 lisp m68k m88k \
+ minithres mips32 mips64 ns32k pa32 pa64 power powerpc32 powerpc64 pyr s390 \
+ sh sparc32 sparc64 thumb vax x86 x86_64 z8000 z8000x
EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST)
@@ -354,14 +327,14 @@ $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/Ma
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
- ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
- && { if test -f $@; then exit 0; else break; fi; }; \
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \
+ && exit 0; \
exit 1;; \
esac; \
done; \
- echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile'; \
- $(am__cd) $(top_srcdir) && \
- $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile'; \
+ cd $(top_srcdir) && \
+ $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
@@ -371,7 +344,6 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
esac;
-$(srcdir)/Makeasm.am:
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
@@ -380,24 +352,28 @@ $(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; for p in $$list; do \
dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
- test "$$dir" != "$$p" || dir=.; \
+ test "$$dir" = "$$p" && dir=.; \
echo "rm -f \"$${dir}/so_locations\""; \
rm -f "$${dir}/so_locations"; \
done
-libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES) $(EXTRA_libmpn_la_DEPENDENCIES)
- $(LINK) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS)
+libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES)
+ $(LINK) $(libmpn_la_LDFLAGS) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
+$(top_builddir)/ansi2knr:
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) ansi2knr
+
+mostlyclean-kr:
+ -test "$U" = "" || rm -f *_.c
.c.o:
$(COMPILE) -c $<
@@ -407,6 +383,233 @@ distclean-compile:
.c.lo:
$(LTCOMPILE) -c -o $@ $<
+add_.c: add.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/add.c; then echo $(srcdir)/add.c; else echo add.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+add_1_.c: add_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/add_1.c; then echo $(srcdir)/add_1.c; else echo add_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+add_n_.c: add_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/add_n.c; then echo $(srcdir)/add_n.c; else echo add_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_1_.c: addmul_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_1.c; then echo $(srcdir)/addmul_1.c; else echo addmul_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_2_.c: addmul_2.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_2.c; then echo $(srcdir)/addmul_2.c; else echo addmul_2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_3_.c: addmul_3.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_3.c; then echo $(srcdir)/addmul_3.c; else echo addmul_3.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_4_.c: addmul_4.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_4.c; then echo $(srcdir)/addmul_4.c; else echo addmul_4.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_5_.c: addmul_5.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_5.c; then echo $(srcdir)/addmul_5.c; else echo addmul_5.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_6_.c: addmul_6.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_6.c; then echo $(srcdir)/addmul_6.c; else echo addmul_6.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_7_.c: addmul_7.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_7.c; then echo $(srcdir)/addmul_7.c; else echo addmul_7.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+addmul_8_.c: addmul_8.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/addmul_8.c; then echo $(srcdir)/addmul_8.c; else echo addmul_8.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+and_n_.c: and_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/and_n.c; then echo $(srcdir)/and_n.c; else echo and_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+andn_n_.c: andn_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/andn_n.c; then echo $(srcdir)/andn_n.c; else echo andn_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+bdivmod_.c: bdivmod.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/bdivmod.c; then echo $(srcdir)/bdivmod.c; else echo bdivmod.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+cmp_.c: cmp.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/cmp.c; then echo $(srcdir)/cmp.c; else echo cmp.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+com_n_.c: com_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/com_n.c; then echo $(srcdir)/com_n.c; else echo com_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+copyd_.c: copyd.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/copyd.c; then echo $(srcdir)/copyd.c; else echo copyd.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+copyi_.c: copyi.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/copyi.c; then echo $(srcdir)/copyi.c; else echo copyi.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+dc_divrem_n_.c: dc_divrem_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/dc_divrem_n.c; then echo $(srcdir)/dc_divrem_n.c; else echo dc_divrem_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+dive_1_.c: dive_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/dive_1.c; then echo $(srcdir)/dive_1.c; else echo dive_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+diveby3_.c: diveby3.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/diveby3.c; then echo $(srcdir)/diveby3.c; else echo diveby3.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+divis_.c: divis.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divis.c; then echo $(srcdir)/divis.c; else echo divis.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+divrem_.c: divrem.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divrem.c; then echo $(srcdir)/divrem.c; else echo divrem.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+divrem_1_.c: divrem_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divrem_1.c; then echo $(srcdir)/divrem_1.c; else echo divrem_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+divrem_2_.c: divrem_2.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divrem_2.c; then echo $(srcdir)/divrem_2.c; else echo divrem_2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+dump_.c: dump.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/dump.c; then echo $(srcdir)/dump.c; else echo dump.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+fib2_ui_.c: fib2_ui.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/fib2_ui.c; then echo $(srcdir)/fib2_ui.c; else echo fib2_ui.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+fib_table_.c: fib_table.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/fib_table.c; then echo $(srcdir)/fib_table.c; else echo fib_table.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+gcd_.c: gcd.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/gcd.c; then echo $(srcdir)/gcd.c; else echo gcd.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+gcd_1_.c: gcd_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/gcd_1.c; then echo $(srcdir)/gcd_1.c; else echo gcd_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+gcdext_.c: gcdext.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/gcdext.c; then echo $(srcdir)/gcdext.c; else echo gcdext.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+get_d_.c: get_d.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/get_d.c; then echo $(srcdir)/get_d.c; else echo get_d.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+get_str_.c: get_str.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/get_str.c; then echo $(srcdir)/get_str.c; else echo get_str.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+hamdist_.c: hamdist.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/hamdist.c; then echo $(srcdir)/hamdist.c; else echo hamdist.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+hgcd_.c: hgcd.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/hgcd.c; then echo $(srcdir)/hgcd.c; else echo hgcd.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+hgcd2_.c: hgcd2.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/hgcd2.c; then echo $(srcdir)/hgcd2.c; else echo hgcd2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+invert_limb_.c: invert_limb.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/invert_limb.c; then echo $(srcdir)/invert_limb.c; else echo invert_limb.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+ior_n_.c: ior_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/ior_n.c; then echo $(srcdir)/ior_n.c; else echo ior_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+iorn_n_.c: iorn_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/iorn_n.c; then echo $(srcdir)/iorn_n.c; else echo iorn_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+jacbase_.c: jacbase.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/jacbase.c; then echo $(srcdir)/jacbase.c; else echo jacbase.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+lshift_.c: lshift.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/lshift.c; then echo $(srcdir)/lshift.c; else echo lshift.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+matrix22_mul_.c: matrix22_mul.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/matrix22_mul.c; then echo $(srcdir)/matrix22_mul.c; else echo matrix22_mul.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mod_1_.c: mod_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mod_1.c; then echo $(srcdir)/mod_1.c; else echo mod_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mod_1_1_.c: mod_1_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mod_1_1.c; then echo $(srcdir)/mod_1_1.c; else echo mod_1_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mod_1_2_.c: mod_1_2.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mod_1_2.c; then echo $(srcdir)/mod_1_2.c; else echo mod_1_2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mod_1_3_.c: mod_1_3.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mod_1_3.c; then echo $(srcdir)/mod_1_3.c; else echo mod_1_3.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mod_1_4_.c: mod_1_4.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mod_1_4.c; then echo $(srcdir)/mod_1_4.c; else echo mod_1_4.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mod_34lsub1_.c: mod_34lsub1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mod_34lsub1.c; then echo $(srcdir)/mod_34lsub1.c; else echo mod_34lsub1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mode1o_.c: mode1o.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mode1o.c; then echo $(srcdir)/mode1o.c; else echo mode1o.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mp_bases_.c: mp_bases.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_bases.c; then echo $(srcdir)/mp_bases.c; else echo mp_bases.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_.c: mul.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul.c; then echo $(srcdir)/mul.c; else echo mul.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_1_.c: mul_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_1.c; then echo $(srcdir)/mul_1.c; else echo mul_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_2_.c: mul_2.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_2.c; then echo $(srcdir)/mul_2.c; else echo mul_2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_3_.c: mul_3.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_3.c; then echo $(srcdir)/mul_3.c; else echo mul_3.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_4_.c: mul_4.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_4.c; then echo $(srcdir)/mul_4.c; else echo mul_4.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_basecase_.c: mul_basecase.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_basecase.c; then echo $(srcdir)/mul_basecase.c; else echo mul_basecase.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_fft_.c: mul_fft.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_fft.c; then echo $(srcdir)/mul_fft.c; else echo mul_fft.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_n_.c: mul_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_n.c; then echo $(srcdir)/mul_n.c; else echo mul_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_toom22_.c: mul_toom22.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_toom22.c; then echo $(srcdir)/mul_toom22.c; else echo mul_toom22.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_toom32_.c: mul_toom32.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_toom32.c; then echo $(srcdir)/mul_toom32.c; else echo mul_toom32.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mul_toom42_.c: mul_toom42.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mul_toom42.c; then echo $(srcdir)/mul_toom42.c; else echo mul_toom42.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mullow_basecase_.c: mullow_basecase.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mullow_basecase.c; then echo $(srcdir)/mullow_basecase.c; else echo mullow_basecase.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+mullow_n_.c: mullow_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mullow_n.c; then echo $(srcdir)/mullow_n.c; else echo mullow_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+nand_n_.c: nand_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/nand_n.c; then echo $(srcdir)/nand_n.c; else echo nand_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+neg_n_.c: neg_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/neg_n.c; then echo $(srcdir)/neg_n.c; else echo neg_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+nior_n_.c: nior_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/nior_n.c; then echo $(srcdir)/nior_n.c; else echo nior_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+perfsqr_.c: perfsqr.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/perfsqr.c; then echo $(srcdir)/perfsqr.c; else echo perfsqr.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+popcount_.c: popcount.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/popcount.c; then echo $(srcdir)/popcount.c; else echo popcount.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+pow_1_.c: pow_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/pow_1.c; then echo $(srcdir)/pow_1.c; else echo pow_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+pre_divrem_1_.c: pre_divrem_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/pre_divrem_1.c; then echo $(srcdir)/pre_divrem_1.c; else echo pre_divrem_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+pre_mod_1_.c: pre_mod_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/pre_mod_1.c; then echo $(srcdir)/pre_mod_1.c; else echo pre_mod_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+random_.c: random.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/random.c; then echo $(srcdir)/random.c; else echo random.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+random2_.c: random2.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/random2.c; then echo $(srcdir)/random2.c; else echo random2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+rootrem_.c: rootrem.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/rootrem.c; then echo $(srcdir)/rootrem.c; else echo rootrem.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+rshift_.c: rshift.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/rshift.c; then echo $(srcdir)/rshift.c; else echo rshift.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+sb_divrem_mn_.c: sb_divrem_mn.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/sb_divrem_mn.c; then echo $(srcdir)/sb_divrem_mn.c; else echo sb_divrem_mn.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+scan0_.c: scan0.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/scan0.c; then echo $(srcdir)/scan0.c; else echo scan0.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+scan1_.c: scan1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/scan1.c; then echo $(srcdir)/scan1.c; else echo scan1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+set_str_.c: set_str.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/set_str.c; then echo $(srcdir)/set_str.c; else echo set_str.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+sqr_basecase_.c: sqr_basecase.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/sqr_basecase.c; then echo $(srcdir)/sqr_basecase.c; else echo sqr_basecase.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+sqr_diagonal_.c: sqr_diagonal.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/sqr_diagonal.c; then echo $(srcdir)/sqr_diagonal.c; else echo sqr_diagonal.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+sqrtrem_.c: sqrtrem.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/sqrtrem.c; then echo $(srcdir)/sqrtrem.c; else echo sqrtrem.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+sub_.c: sub.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/sub.c; then echo $(srcdir)/sub.c; else echo sub.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+sub_1_.c: sub_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/sub_1.c; then echo $(srcdir)/sub_1.c; else echo sub_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+sub_n_.c: sub_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/sub_n.c; then echo $(srcdir)/sub_n.c; else echo sub_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+submul_1_.c: submul_1.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/submul_1.c; then echo $(srcdir)/submul_1.c; else echo submul_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+tdiv_qr_.c: tdiv_qr.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/tdiv_qr.c; then echo $(srcdir)/tdiv_qr.c; else echo tdiv_qr.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+udiv_qrnnd_.c: udiv_qrnnd.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/udiv_qrnnd.c; then echo $(srcdir)/udiv_qrnnd.c; else echo udiv_qrnnd.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+udiv_w_sdiv_.c: udiv_w_sdiv.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/udiv_w_sdiv.c; then echo $(srcdir)/udiv_w_sdiv.c; else echo udiv_w_sdiv.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+xnor_n_.c: xnor_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/xnor_n.c; then echo $(srcdir)/xnor_n.c; else echo xnor_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+xor_n_.c: xor_n.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/xor_n.c; then echo $(srcdir)/xor_n.c; else echo xor_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
+add_.$(OBJEXT) add_.lo add_1_.$(OBJEXT) add_1_.lo add_n_.$(OBJEXT) \
+add_n_.lo addmul_1_.$(OBJEXT) addmul_1_.lo addmul_2_.$(OBJEXT) \
+addmul_2_.lo addmul_3_.$(OBJEXT) addmul_3_.lo addmul_4_.$(OBJEXT) \
+addmul_4_.lo addmul_5_.$(OBJEXT) addmul_5_.lo addmul_6_.$(OBJEXT) \
+addmul_6_.lo addmul_7_.$(OBJEXT) addmul_7_.lo addmul_8_.$(OBJEXT) \
+addmul_8_.lo and_n_.$(OBJEXT) and_n_.lo andn_n_.$(OBJEXT) andn_n_.lo \
+bdivmod_.$(OBJEXT) bdivmod_.lo cmp_.$(OBJEXT) cmp_.lo com_n_.$(OBJEXT) \
+com_n_.lo copyd_.$(OBJEXT) copyd_.lo copyi_.$(OBJEXT) copyi_.lo \
+dc_divrem_n_.$(OBJEXT) dc_divrem_n_.lo dive_1_.$(OBJEXT) dive_1_.lo \
+diveby3_.$(OBJEXT) diveby3_.lo divis_.$(OBJEXT) divis_.lo \
+divrem_.$(OBJEXT) divrem_.lo divrem_1_.$(OBJEXT) divrem_1_.lo \
+divrem_2_.$(OBJEXT) divrem_2_.lo dump_.$(OBJEXT) dump_.lo \
+fib2_ui_.$(OBJEXT) fib2_ui_.lo fib_table_.$(OBJEXT) fib_table_.lo \
+gcd_.$(OBJEXT) gcd_.lo gcd_1_.$(OBJEXT) gcd_1_.lo gcdext_.$(OBJEXT) \
+gcdext_.lo get_d_.$(OBJEXT) get_d_.lo get_str_.$(OBJEXT) get_str_.lo \
+hamdist_.$(OBJEXT) hamdist_.lo hgcd_.$(OBJEXT) hgcd_.lo \
+hgcd2_.$(OBJEXT) hgcd2_.lo invert_limb_.$(OBJEXT) invert_limb_.lo \
+ior_n_.$(OBJEXT) ior_n_.lo iorn_n_.$(OBJEXT) iorn_n_.lo \
+jacbase_.$(OBJEXT) jacbase_.lo lshift_.$(OBJEXT) lshift_.lo \
+matrix22_mul_.$(OBJEXT) matrix22_mul_.lo mod_1_.$(OBJEXT) mod_1_.lo \
+mod_1_1_.$(OBJEXT) mod_1_1_.lo mod_1_2_.$(OBJEXT) mod_1_2_.lo \
+mod_1_3_.$(OBJEXT) mod_1_3_.lo mod_1_4_.$(OBJEXT) mod_1_4_.lo \
+mod_34lsub1_.$(OBJEXT) mod_34lsub1_.lo mode1o_.$(OBJEXT) mode1o_.lo \
+mp_bases_.$(OBJEXT) mp_bases_.lo mul_.$(OBJEXT) mul_.lo \
+mul_1_.$(OBJEXT) mul_1_.lo mul_2_.$(OBJEXT) mul_2_.lo mul_3_.$(OBJEXT) \
+mul_3_.lo mul_4_.$(OBJEXT) mul_4_.lo mul_basecase_.$(OBJEXT) \
+mul_basecase_.lo mul_fft_.$(OBJEXT) mul_fft_.lo mul_n_.$(OBJEXT) \
+mul_n_.lo mul_toom22_.$(OBJEXT) mul_toom22_.lo mul_toom32_.$(OBJEXT) \
+mul_toom32_.lo mul_toom42_.$(OBJEXT) mul_toom42_.lo \
+mullow_basecase_.$(OBJEXT) mullow_basecase_.lo mullow_n_.$(OBJEXT) \
+mullow_n_.lo nand_n_.$(OBJEXT) nand_n_.lo neg_n_.$(OBJEXT) neg_n_.lo \
+nior_n_.$(OBJEXT) nior_n_.lo perfsqr_.$(OBJEXT) perfsqr_.lo \
+popcount_.$(OBJEXT) popcount_.lo pow_1_.$(OBJEXT) pow_1_.lo \
+pre_divrem_1_.$(OBJEXT) pre_divrem_1_.lo pre_mod_1_.$(OBJEXT) \
+pre_mod_1_.lo random_.$(OBJEXT) random_.lo random2_.$(OBJEXT) \
+random2_.lo rootrem_.$(OBJEXT) rootrem_.lo rshift_.$(OBJEXT) \
+rshift_.lo sb_divrem_mn_.$(OBJEXT) sb_divrem_mn_.lo scan0_.$(OBJEXT) \
+scan0_.lo scan1_.$(OBJEXT) scan1_.lo set_str_.$(OBJEXT) set_str_.lo \
+sqr_basecase_.$(OBJEXT) sqr_basecase_.lo sqr_diagonal_.$(OBJEXT) \
+sqr_diagonal_.lo sqrtrem_.$(OBJEXT) sqrtrem_.lo sub_.$(OBJEXT) sub_.lo \
+sub_1_.$(OBJEXT) sub_1_.lo sub_n_.$(OBJEXT) sub_n_.lo \
+submul_1_.$(OBJEXT) submul_1_.lo tdiv_qr_.$(OBJEXT) tdiv_qr_.lo \
+udiv_qrnnd_.$(OBJEXT) udiv_qrnnd_.lo udiv_w_sdiv_.$(OBJEXT) \
+udiv_w_sdiv_.lo xnor_n_.$(OBJEXT) xnor_n_.lo xor_n_.$(OBJEXT) \
+xor_n_.lo : $(ANSI2KNR)
mostlyclean-libtool:
-rm -f *.lo
@@ -414,85 +617,82 @@ mostlyclean-libtool:
clean-libtool:
-rm -rf .libs _libs
+distclean-libtool:
+ -rm -f libtool
+uninstall-info-am:
+
ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | \
- $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
- END { if (nonempty) { for (i in files) print i; }; }'`; \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
mkid -fID $$unique
tags: TAGS
TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
$(TAGS_FILES) $(LISP)
- set x; \
+ tags=; \
here=`pwd`; \
list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | \
- $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
- END { if (nonempty) { for (i in files) print i; }; }'`; \
- shift; \
- if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
- test -n "$$unique" || unique=$$empty_fix; \
- if test $$# -gt 0; then \
- $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
- "$$@" $$unique; \
- else \
- $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
- $$unique; \
- fi; \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
+ test -z "$$unique" && unique=$$empty_fix; \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$tags $$unique; \
fi
ctags: CTAGS
CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
$(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | \
- $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
- END { if (nonempty) { for (i in files) print i; }; }'`; \
- test -z "$(CTAGS_ARGS)$$unique" \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(CTAGS_ARGS)$$tags$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
- $$unique
+ $$tags $$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
- && $(am__cd) $(top_srcdir) \
- && gtags -i $(GTAGS_ARGS) "$$here"
+ && cd $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) $$here
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(DISTFILES)
- @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
- topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
- list='$(DISTFILES)'; \
- dist_files=`for file in $$list; do echo $$file; done | \
- sed -e "s|^$$srcdirstrip/||;t" \
- -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
- case $$dist_files in \
- */*) $(MKDIR_P) `echo "$$dist_files" | \
- sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
- sort -u` ;; \
- esac; \
- for file in $$dist_files; do \
+ @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
+ list='$(DISTFILES)'; for file in $$list; do \
+ case $$file in \
+ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
+ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
+ esac; \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test "$$dir" != "$$file" && test "$$dir" != "."; then \
+ dir="/$$dir"; \
+ $(mkdir_p) "$(distdir)$$dir"; \
+ else \
+ dir=''; \
+ fi; \
if test -d $$d/$$file; then \
- dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
- if test -d "$(distdir)/$$file"; then \
- find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
- fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
- cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
- find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
fi; \
- cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
else \
- test -f "$(distdir)/$$file" \
- || cp -p $$d/$$file "$(distdir)/$$file" \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file \
|| exit 1; \
fi; \
done
@@ -510,22 +710,16 @@ install-am: all-am
installcheck: installcheck-am
install-strip:
- if test -z '$(STRIP)'; then \
- $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
- install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
- install; \
- else \
- $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
- install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
- "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
- fi
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ `test -z '$(STRIP)' || \
+ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
mostlyclean-generic:
clean-generic:
distclean-generic:
- -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
- -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+ -rm -f $(CONFIG_CLEAN_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@@ -538,7 +732,7 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
distclean: distclean-am
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
- distclean-tags
+ distclean-libtool distclean-tags
dvi: dvi-am
@@ -546,38 +740,18 @@ dvi-am:
html: html-am
-html-am:
-
info: info-am
info-am:
install-data-am:
-install-dvi: install-dvi-am
-
-install-dvi-am:
-
install-exec-am:
-install-html: install-html-am
-
-install-html-am:
-
install-info: install-info-am
-install-info-am:
-
install-man:
-install-pdf: install-pdf-am
-
-install-pdf-am:
-
-install-ps: install-ps-am
-
-install-ps-am:
-
installcheck-am:
maintainer-clean: maintainer-clean-am
@@ -586,7 +760,7 @@ maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-am
-mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+mostlyclean-am: mostlyclean-compile mostlyclean-generic mostlyclean-kr \
mostlyclean-libtool
pdf: pdf-am
@@ -597,22 +771,19 @@ ps: ps-am
ps-am:
-uninstall-am:
-
-.MAKE: install-am install-strip
+uninstall-am: uninstall-info-am
.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
clean-libtool clean-noinstLTLIBRARIES ctags distclean \
distclean-compile distclean-generic distclean-libtool \
distclean-tags distdir dvi dvi-am html html-am info info-am \
- install install-am install-data install-data-am install-dvi \
- install-dvi-am install-exec install-exec-am install-html \
- install-html-am install-info install-info-am install-man \
- install-pdf install-pdf-am install-ps install-ps-am \
+ install install-am install-data install-data-am install-exec \
+ install-exec-am install-info install-info-am install-man \
install-strip installcheck installcheck-am installdirs \
maintainer-clean maintainer-clean-generic mostlyclean \
- mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
- pdf pdf-am ps ps-am tags uninstall uninstall-am
+ mostlyclean-compile mostlyclean-generic mostlyclean-kr \
+ mostlyclean-libtool pdf pdf-am ps ps-am tags uninstall \
+ uninstall-am uninstall-info-am
# These are BUILT_SOURCES at the top-level, so normally they're built before
@@ -625,6 +796,9 @@ mp_bases.c:
perfsqr.h:
cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h
+tune-gcd-p: gcd.c
+ $(COMPILE) -g -O1 -I $(top_srcdir)/tune -DTUNE_GCD_P=1 gcd.c -o tune-gcd-p -L ../.libs -L../tune/.libs -lspeed -lgmp -lm
+
# .s assembler, no preprocessing.
#
.s.o:
@@ -680,7 +854,6 @@ perfsqr.h:
$(RM_TMP) tmp-$*.s
.asm.lo:
$(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
-
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:
diff --git a/gmp/mpn/README b/gmp/mpn/README
index bc046be732..32fc007e40 100644
--- a/gmp/mpn/README
+++ b/gmp/mpn/README
@@ -3,28 +3,17 @@ Copyright 1996, 1999 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/a29k/add_n.s b/gmp/mpn/a29k/add_n.s
new file mode 100644
index 0000000000..2d926047fd
--- /dev/null
+++ b/gmp/mpn/a29k/add_n.s
@@ -0,0 +1,118 @@
+; 29000 mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that don't
+; affect carry, so we need to save and restore that as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_add_n
+ .word 0x60000
+___gmpn_add_n: ; {lr2,size} = {lr3,size} + {lr4,size}; carry-out (0 or 1) is left in gr96
+ srl gr117,lr5,3 ; gr117 = size / 8 = number of full 8-limb chunks
+ sub gr118,gr117,1 ; negative when there is no full chunk
+ jmpt gr118,Ltail ; size < 8: skip the main loop
+ constn gr116,-1 ; init cy reg (delay slot, runs on both paths); non-zero = carry clear
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1) ; CR = count - 1 for the following loadm
+ loadm 0,0,gr96,lr3 ; gr96..gr103 = next 8 limbs of s1
+ add lr3,lr3,32 ; advance s1_ptr by 8 limbs (clobbers carry, hence gr116)
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4 ; gr104..gr111 = next 8 limbs of s2
+ add lr4,lr4,32 ; advance s2_ptr by 8 limbs
+
+ subr gr116,gr116,0 ; restore carry
+ addc gr96,gr96,gr104 ; 8-limb add-with-carry chain, results in gr96..gr103
+ addc gr97,gr97,gr105
+ addc gr98,gr98,gr106
+ addc gr99,gr99,gr107
+ addc gr100,gr100,gr108
+ addc gr101,gr101,gr109
+ addc gr102,gr102,gr110
+ addc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2 ; write the 8 sum limbs
+ jmpfdec gr117,Loop ; loop again until the chunk counter has gone negative
+ add lr2,lr2,32 ; (delay slot) advance res_ptr by 8 limbs
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+; Each jmpfdec below has an addc in its delay slot, which always executes.
+; The branch continues down the ladder while limbs remain; otherwise control
+; falls into "jmp Lstore", whose delay slot reloads CR for the storem.
+
+ and lr5,lr5,(8-1) ; lr5 = size mod 8 = limbs left after the main loop
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend ; nothing left: return
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3 ; gr96.. = remaining limbs of s1
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4 ; gr104.. = remaining limbs of s2
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ addc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ addc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ addc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ addc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ addc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ addc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: addc gr102,gr102,gr110 ; NOTE(review): this 7-limb path reaches Lstore without reloading CR, unlike the jmp paths - confirm CR is still valid here
+
+Lstore: storem 0,0,gr96,lr2 ; write the remaining sum limbs
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0 ; return through lr0
+ add gr96,gr116,1 ; (delay slot) gr96 = gr116 + 1, i.e. 1 if carry out, else 0
diff --git a/gmp/mpn/a29k/addmul_1.s b/gmp/mpn/a29k/addmul_1.s
new file mode 100644
index 0000000000..fcf7fc2f39
--- /dev/null
+++ b/gmp/mpn/a29k/addmul_1.s
@@ -0,0 +1,111 @@
+; 29000 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; add the product to a second limb vector.
+
+; Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_addmul_1
+ .word 0x60000
+___gmpn_addmul_1: ; {lr2,size} += {lr3,size} * lr5; the final carry limb is left in gr96
+ sub lr4,lr4,8 ; lr4 = size - 8
+ jmpt lr4,Ltail ; negative: fewer than 8 limbs, skip the main loop
+ const gr120,0 ; init cylimb reg (delay slot, runs on both paths)
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1) ; CR = 7 for an 8-word loadm
+ loadm 0,0,gr96,lr3 ; gr96..gr103 = next 8 limbs of s1
+ add lr3,lr3,32 ; advance s1_ptr by 8 limbs
+
+; Judging by how the results are combined below, multiplu yields the low
+; word and multmu the high word of each limb * s2_limb product.
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120 ; fold in the carry limb from the previous chunk
+ addc gr105,gr105,gr96 ; low word i + high word i-1, carry rippling upward
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0 ; new carry limb = top high word + carry
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2 ; gr96..gr103 = current 8 result limbs
+
+ add gr104,gr96,gr104 ; accumulate the products into the result limbs
+ addc gr105,gr97,gr105
+ addc gr106,gr98,gr106
+ addc gr107,gr99,gr107
+ addc gr108,gr100,gr108
+ addc gr109,gr101,gr109
+ addc gr110,gr102,gr110
+ addc gr111,gr103,gr111
+ addc gr120,gr120,0 ; propagate the carry of this addition into the carry limb
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2 ; write back the 8 updated result limbs
+ jmpfdec gr117,Loop ; loop again until the chunk counter has gone negative
+ add lr2,lr2,32 ; (delay slot) advance res_ptr by 8 limbs
+
+Ltail: and lr4,lr4,(8-1) ; lr4 = limbs still to do (size mod 8)
+ sub gr118,lr4,1 ; negative when no limbs remain
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; (delay slot) count for jmpfdec
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+; One limb per iteration; res_ptr is pre-incremented inside the loop.
+Loop2: load 0,0,gr116,lr3 ; gr116 = next s1 limb
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5 ; gr117 = low product word
+ multmu gr118,gr116,lr5 ; gr118 = high product word
+ add lr2,lr2,4
+ load 0,0,gr119,lr2 ; gr119 = current result limb
+ add gr117,gr117,gr120 ; add incoming carry limb
+ addc gr118,gr118,0
+ add gr117,gr117,gr119 ; add the result limb; its carry is consumed by the addc below
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0 ; (delay slot) next carry limb = high word + carry
+
+Lend: jmpi lr0 ; return through lr0
+ or gr96,gr120,0 ; copy
diff --git a/gmp/mpn/a29k/lshift.s b/gmp/mpn/a29k/lshift.s
new file mode 100644
index 0000000000..3df6dabfe4
--- /dev/null
+++ b/gmp/mpn/a29k/lshift.s
@@ -0,0 +1,91 @@
+; 29000 __gmpn_lshift -- Shift a limb vector left by cnt bits and store
+; the result in a second limb vector.
+
+; Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; cnt lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_lshift
+ .word 0x60000
+___gmpn_lshift: ; works from the most significant limb downwards
+ sll gr116,lr4,2 ; gr116 = lr4 * 4 = vector length in bytes
+ add lr3,gr116,lr3 ; lr3 = just past the top source limb
+ add lr2,gr116,lr2 ; lr2 = just past the top result limb
+ sub lr3,lr3,4
+ load 0,0,gr119,lr3 ; gr119 = most significant source limb
+
+ subr gr116,lr5,32 ; gr116 = 32 - lr5
+ srl gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chunk count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail ; no full chunk: go to the limb-at-a-time loop
+ mtsr fc,lr5 ; (delay slot, runs on both paths) FC = shift count used by extract
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+; gr119 carries the limb just above the chunk being processed.
+Loop: sub lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3 ; gr100..gr107 = next 8 lower source limbs
+
+ extract gr109,gr119,gr107 ; each extract merges a limb with the one below it, per FC
+ extract gr108,gr107,gr106
+ extract gr107,gr106,gr105
+ extract gr106,gr105,gr104
+ extract gr105,gr104,gr103
+ extract gr104,gr103,gr102
+ extract gr103,gr102,gr101
+ extract gr102,gr101,gr100
+
+ sub lr2,lr2,32
+ mtsrim cr,(8-1)
+ storem 0,0,gr102,lr2 ; write the 8 result limbs gr102..gr109
+ jmpfdec gr117,Loop
+ or gr119,gr100,0 ; (delay slot) lowest limb loaded becomes the next upper limb
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1) ; lr4 = limb pairs left after the main loop
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: sub lr3,lr3,4
+ load 0,0,gr116,lr3 ; gr116 = next lower source limb
+ extract gr117,gr119,gr116
+ sub lr2,lr2,4
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ or gr119,gr116,0 ; (delay slot) carry the limb down for the next iteration
+
+Lend: extract gr117,gr119,0 ; least significant result limb: pair the last limb with 0
+ sub lr2,lr2,4
+ jmpi lr0 ; return through lr0
+ store 0,0,gr117,lr2 ; (delay slot) write the least significant result limb
diff --git a/gmp/mpn/a29k/mul_1.s b/gmp/mpn/a29k/mul_1.s
new file mode 100644
index 0000000000..a55fe3e367
--- /dev/null
+++ b/gmp/mpn/a29k/mul_1.s
@@ -0,0 +1,95 @@
+; 29000 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_mul_1
+ .word 0x60000
+___gmpn_mul_1: ; {lr2,size} = {lr3,size} * lr5; the final carry limb is left in gr96
+ sub lr4,lr4,8 ; lr4 = size - 8
+ jmpt lr4,Ltail ; negative: fewer than 8 limbs, skip the main loop
+ const gr120,0 ; init cylimb reg (delay slot, runs on both paths)
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1) ; CR = 7 for an 8-word loadm
+ loadm 0,0,gr96,lr3 ; gr96..gr103 = next 8 limbs of s1
+ add lr3,lr3,32 ; advance s1_ptr by 8 limbs
+
+; Judging by how the results are combined below, multiplu yields the low
+; word and multmu the high word of each limb * s2_limb product.
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120 ; fold in the carry limb from the previous chunk
+ addc gr105,gr105,gr96 ; low word i + high word i-1, carry rippling upward
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0 ; new carry limb = top high word + carry
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2 ; write the 8 product limbs
+ jmpfdec gr117,Loop ; loop again until the chunk counter has gone negative
+ add lr2,lr2,32 ; (delay slot) advance res_ptr by 8 limbs
+
+Ltail: and lr4,lr4,(8-1) ; lr4 = limbs still to do (size mod 8)
+ sub gr118,lr4,1 ; negative when no limbs remain
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; (delay slot) count for jmpfdec
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+; One limb per iteration; res_ptr is pre-incremented inside the loop.
+Loop2: load 0,0,gr116,lr3 ; gr116 = next s1 limb
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5 ; gr117 = low product word
+ multmu gr118,gr116,lr5 ; gr118 = high product word
+ add lr2,lr2,4
+ add gr117,gr117,gr120 ; add incoming carry limb; its carry is consumed by the addc below
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0 ; (delay slot) next carry limb = high word + carry
+
+Lend: jmpi lr0 ; return through lr0
+ or gr96,gr120,0 ; copy
diff --git a/gmp/mpn/a29k/rshift.s b/gmp/mpn/a29k/rshift.s
new file mode 100644
index 0000000000..8a3086755a
--- /dev/null
+++ b/gmp/mpn/a29k/rshift.s
@@ -0,0 +1,87 @@
+; 29000 __gmpn_rshift -- Shift a limb vector right by cnt bits and store
+; the result in a second limb vector.
+
+; Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; cnt lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_rshift
+ .word 0x60000
+___gmpn_rshift: ; works from the least significant limb upwards
+ load 0,0,gr119,lr3 ; gr119 = least significant source limb
+ add lr3,lr3,4
+
+ subr gr116,lr5,32 ; gr116 = 32 - lr5
+ sll gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chunk count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail ; no full chunk: go to the limb-at-a-time loop
+ mtsr fc,gr116 ; (delay slot, runs on both paths) FC = 32 - lr5, used by extract
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+; gr119 carries the limb just below the chunk being processed.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3 ; gr100..gr107 = next 8 higher source limbs
+ add lr3,lr3,32
+
+ extract gr98,gr100,gr119 ; each extract merges a limb with the one below it, per FC
+ extract gr99,gr101,gr100
+ extract gr100,gr102,gr101
+ extract gr101,gr103,gr102
+ extract gr102,gr104,gr103
+ extract gr103,gr105,gr104
+ extract gr104,gr106,gr105
+ extract gr105,gr107,gr106
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr98,lr2 ; write the 8 result limbs gr98..gr105
+ add lr2,lr2,32
+ jmpfdec gr117,Loop
+ or gr119,gr107,0 ; (delay slot) highest limb loaded becomes the next lower limb
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1) ; lr4 = limb pairs left after the main loop
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: load 0,0,gr100,lr3 ; gr100 = next higher source limb
+ add lr3,lr3,4
+ extract gr117,gr100,gr119
+ store 0,0,gr117,lr2
+ add lr2,lr2,4
+ jmpfdec lr4,Loop2
+ or gr119,gr100,0 ; (delay slot) carry the limb up for the next iteration
+
+Lend: srl gr117,gr119,lr5 ; most significant result limb = top limb >> lr5
+ jmpi lr0 ; return through lr0
+ store 0,0,gr117,lr2 ; (delay slot) write the most significant result limb
diff --git a/gmp/mpn/a29k/sub_n.s b/gmp/mpn/a29k/sub_n.s
new file mode 100644
index 0000000000..42072a494d
--- /dev/null
+++ b/gmp/mpn/a29k/sub_n.s
@@ -0,0 +1,118 @@
+; 29000 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs per iteration, until fewer than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that don't
+; affect carry, so we need to save and restore that as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_sub_n
+ .word 0x60000
+___gmpn_sub_n: ; {lr2,size} = {lr3,size} - {lr4,size}; borrow-out (0 or 1) is left in gr96
+ srl gr117,lr5,3 ; gr117 = size / 8 = number of full 8-limb chunks
+ sub gr118,gr117,1 ; negative when there is no full chunk
+ jmpt gr118,Ltail ; size < 8: skip the main loop
+ const gr116,0 ; init cy reg (delay slot, runs on both paths); 0 = carry set = no borrow in
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+; subc computes SRCA - SRCB - 1 + C, so a set carry means "no borrow".
+; gr116 is 0 when carry is set and -1 when it is clear; "subr gr116,gr116,0"
+; regenerates the carry flag from it.
+Loop: mtsrim cr,(8-1) ; CR = count - 1 for the following loadm
+ loadm 0,0,gr96,lr3 ; gr96..gr103 = next 8 limbs of s1
+ add lr3,lr3,32 ; advance s1_ptr by 8 limbs (clobbers carry, hence gr116)
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4 ; gr104..gr111 = next 8 limbs of s2
+ add lr4,lr4,32 ; advance s2_ptr by 8 limbs
+
+ subr gr116,gr116,0 ; restore carry
+ subc gr96,gr96,gr104 ; 8-limb subtract-with-carry chain, results in gr96..gr103
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2 ; write the 8 difference limbs
+ jmpfdec gr117,Loop ; loop again until the chunk counter has gone negative
+ add lr2,lr2,32 ; (delay slot) advance res_ptr by 8 limbs
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+; Each jmpfdec below has a subc in its delay slot, which always executes.
+; The branch continues down the ladder while limbs remain; otherwise control
+; falls into "jmp Lstore", whose delay slot reloads CR for the storem.
+
+ and lr5,lr5,(8-1) ; lr5 = size mod 8 = limbs left after the main loop
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend ; nothing left: return
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3 ; gr96.. = remaining limbs of s1
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4 ; gr104.. = remaining limbs of s2
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ subc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ subc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ subc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ subc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ subc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ subc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: subc gr102,gr102,gr110 ; NOTE(review): this 7-limb path reaches Lstore without reloading CR, unlike the jmp paths - confirm CR is still valid here
+
+Lstore: storem 0,0,gr96,lr2 ; write the remaining difference limbs
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0 ; return through lr0
+ subr gr96,gr116,0 ; (delay slot) gr96 = 0 - gr116 = borrow: 1 if carry is clear, else 0
diff --git a/gmp/mpn/a29k/submul_1.s b/gmp/mpn/a29k/submul_1.s
new file mode 100644
index 0000000000..7955b89537
--- /dev/null
+++ b/gmp/mpn/a29k/submul_1.s
@@ -0,0 +1,114 @@
+; 29000 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+; subtract the product from a second limb vector.
+
+; Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_submul_1
+ .word 0x60000
+___gmpn_submul_1:
+ sub lr4,lr4,8 ; size - 8, negative when fewer than 8 limbs
+ jmpt lr4,Ltail ; fewer than 8 limbs: skip the unrolled loop
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1) ; transfer count for loadm = 8 words
+ loadm 0,0,gr96,lr3 ; gr96..gr103 = next 8 limbs of s1
+ add lr3,lr3,32 ; advance s1_ptr by 8 limbs
+
+ multiplu gr104,gr96,lr5 ; gr104..gr111 = low product words
+ multmu gr96,gr96,lr5 ; gr96..gr103 = high product words
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120 ; add incoming cylimb to the lowest product word
+ addc gr105,gr105,gr96 ; each low word += high word of the limb below + carry
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0 ; cylimb = top high word + carry
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2 ; gr96..gr103 = 8 limbs at res_ptr
+
+ sub gr96,gr96,gr104 ; res limbs -= product limbs, borrow chained through all 8
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+
+ subc gr104,gr104,gr104 ; gr104 = cy-1: 0 if no borrow, -1 if borrow. Re-adding gr111 to gr103 would miss a borrow when the limb was 0 with borrow-in
+ sub gr120,gr120,gr104 ; cylimb += borrow out of the 8-limb subtract
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2 ; write the 8 updated limbs back through res_ptr
+ jmpfdec gr117,Loop ; loop while the chunk counter is non-negative
+ add lr2,lr2,32 ; delay slot: advance res_ptr by 8 limbs
+
+Ltail: and lr4,lr4,(8-1) ; limbs left over, 0..7
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend ; nothing left: return
+ sub lr4,lr4,2 ; delay slot: count for jmpfdec in Loop2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3 ; gr116 = next s1 limb
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5 ; gr117 = low product word
+ multmu gr118,gr116,lr5 ; gr118 = high product word
+ add lr2,lr2,4 ; undo the one-limb offset, then step per iteration
+ load 0,0,gr119,lr2 ; gr119 = res limb
+ add gr117,gr117,gr120 ; low word += cylimb
+ addc gr118,gr118,0 ; propagate carry into the high word
+ sub gr119,gr119,gr117 ; res limb -= low word, no borrow-in here
+ add gr104,gr119,gr117 ; invert carry from previous sub
+ store 0,0,gr119,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0 ; delay slot: cylimb = high word + borrow
+
+Lend: jmpi lr0 ; return through lr0
+ or gr96,gr120,0 ; copy
diff --git a/gmp/mpn/a29k/udiv.s b/gmp/mpn/a29k/udiv.s
new file mode 100644
index 0000000000..82c3925a42
--- /dev/null
+++ b/gmp/mpn/a29k/udiv.s
@@ -0,0 +1,28 @@
+; Copyright 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___udiv_qrnnd
+ .word 0x60000
+___udiv_qrnnd:
+ mtsr q,lr3 ; Q = lr3, presumably the high word of the dividend
+ dividu gr96,lr4,lr5 ; gr96 = quotient of (Q:lr4) / lr5; remainder is left in Q
+ mfsr gr116,q ; gr116 = Q after the divide
+ jmpi lr0 ; return through lr0
+ store 0,0,gr116,lr2 ; delay slot: store gr116 through the pointer in lr2
diff --git a/gmp/mpn/a29k/umul.s b/gmp/mpn/a29k/umul.s
new file mode 100644
index 0000000000..02c34e9151
--- /dev/null
+++ b/gmp/mpn/a29k/umul.s
@@ -0,0 +1,27 @@
+; Copyright 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___umul_ppmm
+ .word 0x50000
+___umul_ppmm:
+ multiplu gr116,lr3,lr4 ; gr116 = low word of lr3 * lr4 (unsigned)
+ multmu gr96,lr3,lr4 ; gr96 = high word of lr3 * lr4 (unsigned)
+ jmpi lr0 ; return through lr0
+ store 0,0,gr116,lr2 ; delay slot: store the low word through the pointer in lr2
diff --git a/gmp/mpn/alpha/README b/gmp/mpn/alpha/README
index 09c2f04047..3578c53b85 100644
--- a/gmp/mpn/alpha/README
+++ b/gmp/mpn/alpha/README
@@ -1,30 +1,20 @@
-Copyright 1996, 1997, 1999-2005 Free Software Foundation, Inc.
+Copyright 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2004, 2005 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License along
+with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -46,7 +36,7 @@ Cray T3 code is very very different...
them to "$6" or "$f6" where necessary.
"0x" introduces a hex constant in gas and DEC as, but on Unicos "^X" is
-required. The X() macro accommodates this difference.
+required. The X() macro accommodates this difference.
"cvttqc" is required by DEC as, "cvttq/c" is required by Unicos, and gas will
accept either. We use cvttqc and have an m4 define expand to cvttq/c where
@@ -70,7 +60,7 @@ RELEVANT OPTIMIZATION ISSUES
EV4
1. This chip has very limited store bandwidth. The on-chip L1 cache is write-
- through, and a cache line is transferred from the store buffer to the off-
+ through, and a cache line is transferred from the store buffer to the off-
chip L2 in as much 15 cycles on most systems. This delay hurts mpn_add_n,
mpn_sub_n, mpn_lshift, and mpn_rshift.
diff --git a/gmp/mpn/alpha/add_n.asm b/gmp/mpn/alpha/add_n.asm
index bc572a57a9..77d4cad2ef 100644
--- a/gmp/mpn/alpha/add_n.asm
+++ b/gmp/mpn/alpha/add_n.asm
@@ -1,164 +1,117 @@
dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl store sum in a third limb vector.
-dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2002, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C EV4: ?
-C EV5: 4.75
-C EV6: 3
+C EV4: 7.75
+C EV5: 5.75
+C EV6: 4
-dnl INPUT PARAMETERS
-dnl res_ptr r16
-dnl s1_ptr r17
-dnl s2_ptr r18
-dnl size r19
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C vp r18
+C n r19
ASM_START()
-PROLOGUE(mpn_add_nc)
- bis r20,r31,r25
- br L(com)
-EPILOGUE()
PROLOGUE(mpn_add_n)
- bis r31,r31,r25 C clear cy
-L(com): subq r19,4,r19 C decr loop cnt
- blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
-C Start software pipeline for 1st loop
- ldq r0,0(r18)
- ldq r4,0(r17)
- ldq r1,8(r18)
- ldq r5,8(r17)
- addq r17,32,r17 C update s1_ptr
- addq r0,r4,r28 C 1st main add
- ldq r2,16(r18)
- addq r25,r28,r20 C 1st carry add
- ldq r3,24(r18)
- cmpult r28,r4,r8 C compute cy from last add
- ldq r6,-16(r17)
- cmpult r20,r28,r25 C compute cy from last add
- ldq r7,-8(r17)
- bis r8,r25,r25 C combine cy from the two adds
- subq r19,4,r19 C decr loop cnt
- addq r1,r5,r28 C 2nd main add
- addq r18,32,r18 C update s2_ptr
- addq r28,r25,r21 C 2nd carry add
- cmpult r28,r5,r8 C compute cy from last add
- blt r19,$Lend1 C if less than 4 limbs remain, jump
-C 1st loop handles groups of 4 limbs in a software pipeline
- ALIGN(16)
-$Loop: cmpult r21,r28,r25 C compute cy from last add
- ldq r0,0(r18)
- bis r8,r25,r25 C combine cy from the two adds
- ldq r1,8(r18)
- addq r2,r6,r28 C 3rd main add
- ldq r4,0(r17)
- addq r28,r25,r22 C 3rd carry add
- ldq r5,8(r17)
- cmpult r28,r6,r8 C compute cy from last add
- cmpult r22,r28,r25 C compute cy from last add
- stq r20,0(r16)
- bis r8,r25,r25 C combine cy from the two adds
- stq r21,8(r16)
- addq r3,r7,r28 C 4th main add
- addq r28,r25,r23 C 4th carry add
- cmpult r28,r7,r8 C compute cy from last add
- cmpult r23,r28,r25 C compute cy from last add
- addq r17,32,r17 C update s1_ptr
- bis r8,r25,r25 C combine cy from the two adds
- addq r16,32,r16 C update res_ptr
- addq r0,r4,r28 C 1st main add
- ldq r2,16(r18)
- addq r25,r28,r20 C 1st carry add
- ldq r3,24(r18)
- cmpult r28,r4,r8 C compute cy from last add
- ldq r6,-16(r17)
- cmpult r20,r28,r25 C compute cy from last add
- ldq r7,-8(r17)
- bis r8,r25,r25 C combine cy from the two adds
- subq r19,4,r19 C decr loop cnt
- stq r22,-16(r16)
- addq r1,r5,r28 C 2nd main add
- stq r23,-8(r16)
- addq r25,r28,r21 C 2nd carry add
- addq r18,32,r18 C update s2_ptr
- cmpult r28,r5,r8 C compute cy from last add
- bge r19,$Loop
-C Finish software pipeline for 1st loop
-$Lend1: cmpult r21,r28,r25 C compute cy from last add
- bis r8,r25,r25 C combine cy from the two adds
- addq r2,r6,r28 C 3rd main add
- addq r28,r25,r22 C 3rd carry add
- cmpult r28,r6,r8 C compute cy from last add
- cmpult r22,r28,r25 C compute cy from last add
- stq r20,0(r16)
- bis r8,r25,r25 C combine cy from the two adds
- stq r21,8(r16)
- addq r3,r7,r28 C 4th main add
- addq r28,r25,r23 C 4th carry add
- cmpult r28,r7,r8 C compute cy from last add
- cmpult r23,r28,r25 C compute cy from last add
- bis r8,r25,r25 C combine cy from the two adds
- addq r16,32,r16 C update res_ptr
- stq r22,-16(r16)
- stq r23,-8(r16)
-$Lend2: addq r19,4,r19 C restore loop cnt
- beq r19,$Lret
-C Start software pipeline for 2nd loop
- ldq r0,0(r18)
- ldq r4,0(r17)
+ ldq r3,0(r17) C r3 = up[0]
+ ldq r4,0(r18) C r4 = vp[0]
+
subq r19,1,r19
- beq r19,$Lend0
-C 2nd loop handles remaining 1-3 limbs
- ALIGN(16)
-$Loop0: addq r0,r4,r28 C main add
- ldq r0,8(r18)
- cmpult r28,r4,r8 C compute cy from last add
- ldq r4,8(r17)
- addq r28,r25,r20 C carry add
- addq r18,8,r18
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0 C r0 = running carry, starts at 0
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19 C remaining count is now a multiple of 4
+
+$Loop0: subq r2,1,r2 C one limb per pass of the first loop
+ ldq r5,8(r17) C preload next up limb
+ addq r4,r0,r4 C vlimb += carry in
+ ldq r6,8(r18) C preload next vp limb
+ cmpult r4,r0,r1 C r1 = carry out of vlimb + carry
+ addq r3,r4,r4 C r4 = ulimb + vlimb + carry
+ cmpult r4,r3,r0 C r0 = carry out of that add
+ stq r4,0(r16) C store sum limb
+ bis r0,r1,r0 C combine the two carries
+
addq r17,8,r17
- stq r20,0(r16)
- cmpult r20,r28,r25 C compute cy from last add
- subq r19,1,r19 C decr loop cnt
- bis r8,r25,r25 C combine cy from the two adds
+ addq r18,8,r18
+ bis r5,r5,r3 C move the preloaded limbs into r3 and r4
+ bis r6,r6,r4
addq r16,8,r16
- bne r19,$Loop0
-$Lend0: addq r0,r4,r28 C main add
- addq r28,r25,r20 C carry add
- cmpult r28,r4,r8 C compute cy from last add
- cmpult r20,r28,r25 C compute cy from last add
- stq r20,0(r16)
- bis r8,r25,r25 C combine cy from the two adds
-
-$Lret: bis r25,r31,r0 C return cy
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend C only the already loaded last limb remains
+
+ ALIGN(8)
+$Loop: subq r19,4,r19 C four limbs per pass, limb pairs alternate r3/r4 and r5/r6
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17) C preload the first limb of the next group
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4 C last limb, already in r3 and r4
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0 C r0 = final carry out
ret r31,(r26),1
-EPILOGUE()
+EPILOGUE(mpn_add_n)
ASM_END()
diff --git a/gmp/mpn/alpha/addmul_1.asm b/gmp/mpn/alpha/addmul_1.asm
index c4e6834b61..22c41a5c74 100644
--- a/gmp/mpn/alpha/addmul_1.asm
+++ b/gmp/mpn/alpha/addmul_1.asm
@@ -4,30 +4,19 @@ dnl result to a second limb vector.
dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/alpha-defs.m4 b/gmp/mpn/alpha/alpha-defs.m4
index af34c9294c..b2f9a242a8 100644
--- a/gmp/mpn/alpha/alpha-defs.m4
+++ b/gmp/mpn/alpha/alpha-defs.m4
@@ -3,32 +3,21 @@ divert(-1)
dnl m4 macros for Alpha assembler.
dnl Copyright 2003, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Usage: ASSERT([reg] [,code])
diff --git a/gmp/mpn/alpha/aorslsh1_n.asm b/gmp/mpn/alpha/aorslsh1_n.asm
index 9525e669db..3694f78761 100644
--- a/gmp/mpn/alpha/aorslsh1_n.asm
+++ b/gmp/mpn/alpha/aorslsh1_n.asm
@@ -1,40 +1,36 @@
dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
-dnl Copyright 2003, 2013 Free Software Foundation, Inc.
+dnl Copyright 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C EV4: ?
+C EV4: 12.5
C EV5: 6.25
-C EV6: 4.5
+C EV6: 4.375 (i.e., worse than separate mpn_lshift and mpn_add_n at 3.875)
+C TODO
+C * Write special version for ev6, as this is a slowdown for 100 < n < 2200
+C compared to separate mpn_lshift and mpn_add_n.
+C * Use addq instead of sll for left shift, and similarly cmplt instead of srl
+C for right shift.
+
+dnl INPUT PARAMETERS
define(`rp',`r16')
define(`up',`r17')
define(`vp',`r18')
@@ -42,8 +38,12 @@ define(`n', `r19')
define(`u0', `r8')
define(`u1', `r1')
+define(`u2', `r2')
+define(`u3', `r3')
define(`v0', `r4')
define(`v1', `r5')
+define(`v2', `r6')
+define(`v3', `r7')
define(`cy0', `r0')
define(`cy1', `r20')
@@ -67,98 +67,168 @@ MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
ASM_START()
PROLOGUE(func)
- and n, 2, cy0
- blbs n, L(bx1)
-L(bx0): ldq v1, 0(vp)
+ lda n, -4(n) C bias n by -4, its sign decides whether the main loop runs
+ bis r31, r31, cy1 C no carry in
+ and n, 3, r1 C dispatch on n mod 4
+ beq r1, $Lb00
+ cmpeq r1, 1, r2
+ bne r2, $Lb01
+ cmpeq r1, 2, r2
+ bne r2, $Lb10
+$Lb11: C n = 3, 7, 11, ...
+ ldq v0, 0(vp)
+ ldq u0, 0(up)
+ ldq v1, 8(vp)
+ ldq u1, 8(up)
+ ldq v2, 16(vp)
+ ldq u2, 16(up)
+ lda vp, 24(vp)
+ lda up, 24(up)
+ bge n, $Loop
+ br r31, $Lcj3 C n = 3, go straight to the wind-down code
+$Lb10: C n = 2, 6, 10, ...
+ bis r31, r31, cy0
+ ldq v1, 0(vp)
ldq u1, 0(up)
- nop
- bne cy0, L(b10)
-
-L(b00): lda vp, 48(vp)
- lda up, -16(up)
+ ldq v2, 8(vp)
+ ldq u2, 8(up)
lda rp, -8(rp)
- br r31, L(lo0)
-
-L(b10): lda vp, 32(vp)
+ blt n, $Lcj2 C n = 2, only the wind-down code is needed
+ ldq v3, 16(vp)
+ ldq u3, 16(up)
+ lda vp, 48(vp)
+ lda up, 16(up)
+ br r31, $LL10
+$Lb01: C n = 1, 5, 9, ...
+ ldq v2, 0(vp)
+ ldq u2, 0(up)
+ lda rp, -16(rp)
+ blt n, $Lcj1 C n = 1, only the last wind-down step is needed
+ ldq v3, 8(vp)
+ ldq u3, 8(up)
+ ldq v0, 16(vp)
+ ldq u0, 16(up)
+ lda vp, 40(vp)
+ lda up, 8(up)
+ lda rp, 32(rp)
+ br r31, $LL01
+$Lb00: C n = 4, 8, 12, ...
+ bis r31, r31, cy0
+ ldq v3, 0(vp)
+ ldq u3, 0(up)
+ ldq v0, 8(vp)
+ ldq u0, 8(up)
+ ldq v1, 16(vp)
+ ldq u1, 16(up)
+ lda vp, 32(vp)
lda rp, 8(rp)
- lda cy0, 0(r31)
- br r31, L(lo2)
-
-L(bx1): ldq v0, 0(vp)
- ldq u0, 0(up)
- lda cy1, 0(r31)
- beq cy0, L(b01)
-
-L(b11): lda vp, 40(vp)
- lda up, -24(up)
- lda rp, 16(rp)
- br r31, L(lo3)
-
-L(b01): lda n, -4(n)
- ble n, L(end)
- lda vp, 24(vp)
- lda up, -8(up)
-
+ br r31, $LL00x
ALIGN(16)
-L(top): addq v0, v0, sl C left shift vlimb
- ldq v1, -16(vp)
+C 0
+$Loop: sll v0, 1, sl C left shift vlimb
+ ldq v3, 0(vp)
+C 1
ADDSUB u0, sl, ps C ulimb + (vlimb << 1)
- cmplt v0, r31, cy0 C carry out #1
- ldq u1, 16(up)
+ ldq u3, 0(up)
+C 2
ADDSUB ps, cy1, rr C consume carry from previous operation
+ srl v0, 63, cy0 C carry out #1
+C 3
CARRY( ps, u0, cy) C carry out #2
stq rr, 0(rp)
+C 4
addq cy, cy0, cy0 C combine carry out #1 and #2
CARRY( rr, ps, cy) C carry out #3
+C 5
addq cy, cy0, cy0 C final carry out
lda vp, 32(vp) C bookkeeping
-L(lo0): addq v1, v1, sl
- ldq v0, -40(vp)
+C 6
+$LL10: sll v1, 1, sl
+ ldq v0, -24(vp)
+C 7
ADDSUB u1, sl, ps
- cmplt v1, r31, cy1
- ldq u0, 24(up)
+ ldq u0, 8(up)
+C 8
ADDSUB ps, cy0, rr
+ srl v1, 63, cy1
+C 9
CARRY( ps, u1, cy)
stq rr, 8(rp)
+C 10
addq cy, cy1, cy1
CARRY( rr, ps, cy)
+C 11
addq cy, cy1, cy1
lda rp, 32(rp) C bookkeeping
-L(lo3): addq v0, v0, sl
- ldq v1, -32(vp)
- ADDSUB u0, sl, ps
- cmplt v0, r31, cy0
- ldq u1, 32(up)
+C 12
+$LL01: sll v2, 1, sl
+ ldq v1, -16(vp)
+C 13
+ ADDSUB u2, sl, ps
+ ldq u1, 16(up)
+C 14
ADDSUB ps, cy1, rr
- CARRY( ps, u0, cy)
+ srl v2, 63, cy0
+C 15
+ CARRY( ps, u2, cy)
stq rr, -16(rp)
+C 16
addq cy, cy0, cy0
CARRY( rr, ps, cy)
+C 17
addq cy, cy0, cy0
- lda up, 32(up) C bookkeeping
-L(lo2): addq v1, v1, sl
- ldq v0, -24(vp)
- ADDSUB u1, sl, ps
- cmplt v1, r31, cy1
- ldq u0, 8(up)
+$LL00x: lda up, 32(up) C bookkeeping
+C 18
+ sll v3, 1, sl
+ ldq v2, -8(vp)
+C 19
+ ADDSUB u3, sl, ps
+ ldq u2, -8(up)
+C 20
ADDSUB ps, cy0, rr
- CARRY( ps, u1, cy)
+ srl v3, 63, cy1
+C 21
+ CARRY( ps, u3, cy)
stq rr, -8(rp)
+C 22
addq cy, cy1, cy1
CARRY( rr, ps, cy)
+C 23
addq cy, cy1, cy1
lda n, -4(n) C bookkeeping
- bgt n, L(top)
+C 24
+ bge n, $Loop
-L(end): addq v0, v0, sl
+$Lcj3: sll v0, 1, sl C wind-down: third limb from the end, already loaded
ADDSUB u0, sl, ps
ADDSUB ps, cy1, rr
- cmplt v0, r31, cy0
+ srl v0, 63, cy0
CARRY( ps, u0, cy)
stq rr, 0(rp)
addq cy, cy0, cy0
CARRY( rr, ps, cy)
- addq cy, cy0, r0
+ addq cy, cy0, cy0
+
+$Lcj2: sll v1, 1, sl C wind-down: second limb from the end
+ ADDSUB u1, sl, ps
+ ADDSUB ps, cy0, rr
+ srl v1, 63, cy1
+ CARRY( ps, u1, cy)
+ stq rr, 8(rp)
+ addq cy, cy1, cy1
+ CARRY( rr, ps, cy)
+ addq cy, cy1, cy1
+
+$Lcj1: sll v2, 1, sl C wind-down: last limb
+ ADDSUB u2, sl, ps
+ ADDSUB ps, cy1, rr
+ srl v2, 63, cy0
+ CARRY( ps, u2, cy)
+ stq rr, 16(rp)
+ addq cy, cy0, cy0
+ CARRY( rr, ps, cy)
+ addq cy, cy0, cy0 C final carry is left in cy0, which is r0
+
ret r31,(r26),1
EPILOGUE()
ASM_END()
diff --git a/gmp/mpn/alpha/aorslsh2_n.asm b/gmp/mpn/alpha/aorslsh2_n.asm
deleted file mode 100644
index bdee1d6d02..0000000000
--- a/gmp/mpn/alpha/aorslsh2_n.asm
+++ /dev/null
@@ -1,167 +0,0 @@
-dnl Alpha mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
-
-dnl Copyright 2003, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C EV4: ?
-C EV5: 6
-C EV6: 3.75
-
-C TODO
-C * Tune to reach 3.5 c/l on ev6 and 5.75 c/l on ev5.
-
-define(`rp',`r16')
-define(`up',`r17')
-define(`vp',`r18')
-define(`n', `r19')
-
-define(`u0', `r8')
-define(`u1', `r1')
-define(`v0', `r4')
-define(`v1', `r5')
-
-define(`cy0', `r0')
-define(`cy1', `r20')
-define(`cy', `r22')
-define(`rr', `r24')
-define(`ps', `r25')
-define(`sl', `r28')
-
-ifdef(`OPERATION_addlsh2_n',`
- define(ADDSUB, addq)
- define(CARRY, `cmpult $1,$2,$3')
- define(func, mpn_addlsh2_n)
-')
-ifdef(`OPERATION_sublsh2_n',`
- define(ADDSUB, subq)
- define(CARRY, `cmpult $2,$1,$3')
- define(func, mpn_sublsh2_n)
-')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n)
-
-ASM_START()
-PROLOGUE(func)
- and n, 2, cy0
- blbs n, L(bx1)
-L(bx0): ldq v1, 0(vp)
- ldq u1, 0(up)
- bis r31, r31, r2
- bne cy0, L(b10)
-
-L(b00): lda vp, 48(vp)
- lda up, -16(up)
- lda rp, -8(rp)
- s4addq v1, r31, sl
- br r31, L(lo0)
-
-L(b10): lda vp, 32(vp)
- lda rp, 8(rp)
- lda cy0, 0(r31)
- br r31, L(lo2)
-
-L(bx1): ldq v0, 0(vp)
- ldq u0, 0(up)
- lda cy1, 0(r31)
- bis r31, r31, r3
- nop
- beq cy0, L(b01)
-
-L(b11): lda vp, 40(vp)
- lda up, -24(up)
- lda rp, 16(rp)
- br r31, L(lo3)
-
-L(b01): lda n, -4(n)
- ble n, L(end)
- lda vp, 24(vp)
- lda up, -8(up)
-
- ALIGN(16)
-L(top): s4addq v0, r3, sl C combined vlimb
- ldq v1, -16(vp)
- ADDSUB u0, sl, ps C ulimb + (vlimb << 1)
- ldq u1, 16(up)
- srl v0, 62, r2 C high v bits
- ADDSUB ps, cy1, rr C consume carry from previous operation
- CARRY( ps, u0, cy0) C carry out #2
- stq rr, 0(rp)
- CARRY( rr, ps, cy) C carry out #3
- lda vp, 32(vp) C bookkeeping
- addq cy, cy0, cy0 C final carry out
- s4addq v1, r2, sl
-L(lo0): ldq v0, -40(vp)
- ADDSUB u1, sl, ps
- ldq u0, 24(up)
- srl v1, 62, r3
- ADDSUB ps, cy0, rr
- CARRY( ps, u1, cy1)
- stq rr, 8(rp)
- CARRY( rr, ps, cy)
- lda rp, 32(rp) C bookkeeping
- addq cy, cy1, cy1
-L(lo3): s4addq v0, r3, sl
- ldq v1, -32(vp)
- ADDSUB u0, sl, ps
- ldq u1, 32(up)
- srl v0, 62, r2
- ADDSUB ps, cy1, rr
- CARRY( ps, u0, cy0)
- stq rr, -16(rp)
- CARRY( rr, ps, cy)
- lda up, 32(up) C bookkeeping
- addq cy, cy0, cy0
-L(lo2): s4addq v1, r2, sl
- ldq v0, -24(vp)
- ADDSUB u1, sl, ps
- ldq u0, 8(up)
- srl v1, 62, r3
- ADDSUB ps, cy0, rr
- CARRY( ps, u1, cy1)
- stq rr, -8(rp)
- CARRY( rr, ps, cy)
- lda n, -4(n) C bookkeeping
- addq cy, cy1, cy1
- bgt n, L(top)
-
-L(end): s4addq v0, r3, sl
- ADDSUB u0, sl, ps
- srl v0, 62, r2
- ADDSUB ps, cy1, rr
- CARRY( ps, u0, cy0)
- stq rr, 0(rp)
- CARRY( rr, ps, cy)
- addq cy, cy0, cy0
- addq cy0, r2, r0
-
- ret r31,(r26),1
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/alpha/bdiv_dbm1c.asm b/gmp/mpn/alpha/bdiv_dbm1c.asm
index 472966ca98..e5f11dbf48 100644
--- a/gmp/mpn/alpha/bdiv_dbm1c.asm
+++ b/gmp/mpn/alpha/bdiv_dbm1c.asm
@@ -3,30 +3,19 @@ dnl Alpha mpn_bdiv_dbm1c.
dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/cntlz.asm b/gmp/mpn/alpha/cntlz.asm
index 25af19b131..2bfd923e5e 100644
--- a/gmp/mpn/alpha/cntlz.asm
+++ b/gmp/mpn/alpha/cntlz.asm
@@ -3,30 +3,19 @@ dnl Alpha auxiliary for longlong.h's count_leading_zeros
dnl Copyright 1997, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/copyd.asm b/gmp/mpn/alpha/copyd.asm
index b41b5366cc..ba8fa1c633 100644
--- a/gmp/mpn/alpha/copyd.asm
+++ b/gmp/mpn/alpha/copyd.asm
@@ -3,30 +3,19 @@ dnl Alpha mpn_copyd -- copy, decrementing.
dnl Copyright 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/copyi.asm b/gmp/mpn/alpha/copyi.asm
index f7e2ad6f6a..425804127e 100644
--- a/gmp/mpn/alpha/copyi.asm
+++ b/gmp/mpn/alpha/copyi.asm
@@ -3,30 +3,19 @@ dnl Alpha mpn_copyi -- copy, incrementing.
dnl Copyright 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/default.m4 b/gmp/mpn/alpha/default.m4
index 8fe7c4e122..e7aae2eeea 100644
--- a/gmp/mpn/alpha/default.m4
+++ b/gmp/mpn/alpha/default.m4
@@ -3,33 +3,22 @@ divert(-1)
dnl m4 macros for alpha assembler (everywhere except unicos).
-dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2000, 2002, 2003, 2004 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Usage: ASM_START()
@@ -64,9 +53,8 @@ ifelse(`$2',noalign,,` ALIGN(16)')
.globl $1
.ent $1
$1:
- .frame r30,0,r26,0
-ifelse(`$2',gp,` ldgp r29, 0(r27)
-`$'$1..ng:')
+ifelse(`$2',gp,` ldgp r29,0(r27)')
+ .frame r30,0,r26
.prologue ifelse(`$2',gp,1,0)')
define(`EPILOGUE_cpu',
@@ -102,13 +90,12 @@ forloop(i,0,31,`defreg(`r'i,$i)')
forloop(i,0,31,`deflit(`f'i,``$f''i)')
-dnl Usage: DATASTART(name,align) or DATASTART(name)
+dnl Usage: DATASTART(name)
dnl DATAEND()
define(`DATASTART',
-m4_assert_numargs_range(1,2)
-` RODATA
- ALIGN(ifelse($#,1,2,$2))
+m4_assert_numargs(1)
+` DATA
$1:')
define(`DATAEND',
m4_assert_numargs(0)
@@ -117,7 +104,7 @@ m4_assert_numargs(0)
dnl Load a symbolic address into a register
define(`LEA',
m4_assert_numargs(2)
-`lda $1, $2')
+`lda $1, $2')
dnl Usage: ASM_END()
define(`ASM_END',
diff --git a/gmp/mpn/alpha/dive_1.c b/gmp/mpn/alpha/dive_1.c
index 88b82db2f7..a915c58a9e 100644
--- a/gmp/mpn/alpha/dive_1.c
+++ b/gmp/mpn/alpha/dive_1.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2000-2003 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/alpha/ev5/diveby3.asm b/gmp/mpn/alpha/diveby3.asm
index 3758188e02..e2d1c6beee 100644
--- a/gmp/mpn/alpha/ev5/diveby3.asm
+++ b/gmp/mpn/alpha/diveby3.asm
@@ -1,42 +1,32 @@
dnl Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
-dnl Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C EV4: 22
C EV5: 11.5
-C EV6: 6.3 Note that mpn_bdiv_dbm1c is faster
+C EV6: 6.3
C TODO
-C * Remove the unops, they benefit just ev6, which no longer uses this file.
+C * Trim this to 6.0 c/l for ev6.
+C * Write special ev5 version, should reach 9 c/l, and could be smaller.
C * Try prefetch for destination, using lds.
C * Improve feed-in code, by moving initial mulq earlier; make initial load
C to u0/u0 to save some copying.
@@ -50,7 +40,7 @@ define(`cy', `r19')
ASM_START()
-DATASTART(L(LC),8)
+DATASTART(L(LC))
.quad 0xAAAAAAAAAAAAAAAB
.quad 0x5555555555555555
.quad 0xAAAAAAAAAAAAAAAA
diff --git a/gmp/mpn/alpha/divrem_2.asm b/gmp/mpn/alpha/divrem_2.asm
index 046b246a95..b68468bca0 100644
--- a/gmp/mpn/alpha/divrem_2.asm
+++ b/gmp/mpn/alpha/divrem_2.asm
@@ -1,32 +1,21 @@
dnl Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
-dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc.
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -52,7 +41,8 @@ define(`un_param', `r19')
define(`dp', `r20')
ASM_START()
-PROLOGUE(mpn_divrem_2,gp)
+PROLOGUE(mpn_divrem_2)
+ ldgp r29, 0(r27)
lda r30, -80(r30)
stq r26, 0(r30)
stq r9, 8(r30)
@@ -90,7 +80,7 @@ L(L8): stq r3, 72(r30)
blt r19, L(L10)
bis r31, r12, r16
jsr r26, mpn_invert_limb
- LDGP( r29, 0(r26))
+ ldgp r29, 0(r26)
mulq r0, r12, r4 C t0 = LO(di * d1)
umulh r0, r10, r2 C s1 = HI(di * d0)
addq r4, r10, r4 C t0 += d0
diff --git a/gmp/mpn/alpha/ev5/add_n.asm b/gmp/mpn/alpha/ev5/add_n.asm
new file mode 100644
index 0000000000..626e713ccb
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/add_n.asm
@@ -0,0 +1,146 @@
+dnl Alpha EV5 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 1995, 1999, 2000, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 4.75
+C EV6: 3
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n) C add limb vectors at s1_ptr (r17) and s2_ptr (r18), size (r19) limbs, store at res_ptr (r16), return carry in r0
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18) C s2 limb 0 of first group
+ ldq r4,0(r17) C s1 limb 0 of first group
+ ldq r1,8(r18) C s2 limb 1
+ ldq r5,8(r17) C s1 limb 1
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18) C s2 limb 2
+ addq r0,r4,r20 C 1st main add
+ ldq r3,24(r18) C s2 limb 3
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17) C s1 limb 2 (s1_ptr already advanced by 32)
+ cmpult r20,r0,r25 C compute cy from last add
+ ldq r7,-8(r17) C s1 limb 3 (s1_ptr already advanced by 32)
+ addq r1,r5,r28 C 2nd main add
+ addq r18,32,r18 C update s2_ptr
+ addq r28,r25,r21 C 2nd carry add
+ cmpult r28,r5,r8 C compute cy from last add
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16) C align the loop entry
+$Loop: cmpult r21,r28,r25 C compute cy from last add
+ ldq r0,0(r18) C next group: s2 limb 0
+ bis r8,r25,r25 C combine cy from the two adds
+ ldq r1,8(r18) C next group: s2 limb 1
+ addq r2,r6,r28 C 3rd main add
+ ldq r4,0(r17) C next group: s1 limb 0
+ addq r28,r25,r22 C 3rd carry add
+ ldq r5,8(r17) C next group: s1 limb 1
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16) C store 1st sum limb
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16) C store 2nd sum limb
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18) C next group: s2 limb 2
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18) C next group: s2 limb 3
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17) C next group: s1 limb 2 (s1_ptr already advanced)
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17) C next group: s1 limb 3 (s1_ptr already advanced)
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16) C store 3rd sum limb (res_ptr already advanced)
+ addq r1,r5,r28 C 2nd main add
+ stq r23,-8(r16) C store 4th sum limb (res_ptr already advanced)
+ addq r25,r28,r21 C 2nd carry add
+ addq r18,32,r18 C update s2_ptr
+ cmpult r28,r5,r8 C compute cy from last add
+ bge r19,$Loop C repeat while another full group of 4 remains
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r21,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r2,r6,r28 C 3rd main add
+ addq r28,r25,r22 C 3rd carry add
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16) C store 1st sum limb
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16) C store 2nd sum limb
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16) C store 3rd sum limb (res_ptr already advanced)
+ stq r23,-8(r16) C store 4th sum limb (res_ptr already advanced)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret C no limbs left, return cy
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18) C load s2 limb
+ ldq r4,0(r17) C load s1 limb
+ subq r19,1,r19 C count the limb just loaded
+ beq r19,$Lend0 C it was the last limb, skip the loop
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16) C align the loop entry
+$Loop0: addq r0,r4,r28 C main add
+ ldq r0,8(r18) C preload next s2 limb
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r4,8(r17) C preload next s1 limb
+ addq r28,r25,r20 C carry add
+ addq r18,8,r18 C update s2_ptr
+ addq r17,8,r17 C update s1_ptr
+ stq r20,0(r16) C store sum limb
+ cmpult r20,r28,r25 C compute cy from last add
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,8,r16 C update res_ptr
+ bne r19,$Loop0 C repeat until the preloaded limb is the last one
+$Lend0: addq r0,r4,r28 C main add
+ addq r28,r25,r20 C carry add
+ cmpult r28,r4,r8 C compute cy from last add
+ cmpult r20,r28,r25 C compute cy from last add
+ stq r20,0(r16) C store final sum limb
+ bis r8,r25,r25 C combine cy from the two adds
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1 C return to the address held in r26
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/gmp/mpn/alpha/com.asm b/gmp/mpn/alpha/ev5/com_n.asm
index f084ab5e96..979e711eb8 100644
--- a/gmp/mpn/alpha/com.asm
+++ b/gmp/mpn/alpha/ev5/com_n.asm
@@ -1,32 +1,21 @@
-dnl Alpha mpn_com -- mpn one's complement.
+dnl Alpha EV5 mpn_com_n -- mpn one's complement.
dnl Copyright 2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -37,7 +26,7 @@ C EV5: 2.0
C EV6: 1.5
-C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
@@ -71,7 +60,7 @@ FLOAT64(L(dat), 2.0)
ALIGN(16)
-PROLOGUE(mpn_com,gp)
+PROLOGUE(mpn_com_n,gp)
C r16 dst
C r17 src
diff --git a/gmp/mpn/alpha/ev5/gmp-mparam.h b/gmp/mpn/alpha/ev5/gmp-mparam.h
index b560c20afe..cbedd4f173 100644
--- a/gmp/mpn/alpha/ev5/gmp-mparam.h
+++ b/gmp/mpn/alpha/ev5/gmp-mparam.h
@@ -1,187 +1,81 @@
/* Alpha EV5 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free
-Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
/* 600 MHz 21164A */
-/* FFT tuning limit = 5000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 22
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
-#define USE_PREINV_DIVREM_1 1 /* preinv always */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 76
-
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 50
-#define MUL_TOOM44_THRESHOLD 118
-#define MUL_TOOM6H_THRESHOLD 157
-#define MUL_TOOM8H_THRESHOLD 236
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 22
-#define SQR_TOOM3_THRESHOLD 73
-#define SQR_TOOM4_THRESHOLD 178
-#define SQR_TOOM6_THRESHOLD 0 /* always */
-#define SQR_TOOM8_THRESHOLD 260
-
-#define MULMID_TOOM42_THRESHOLD 18
-
-#define MULMOD_BNM1_THRESHOLD 9
-#define SQRMOD_BNM1_THRESHOLD 12
-
-#define MUL_FFT_MODF_THRESHOLD 284 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 284, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
- { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \
- { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \
- { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
- { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \
- { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 63, 8}, \
- { 255, 7}, { 511,10}, { 71, 9}, { 143, 8}, \
- { 287, 7}, { 575, 9}, { 159, 8}, { 319,11}, \
- { 47,12}, { 31,11}, { 63, 9}, { 255, 8}, \
- { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \
- { 159, 9}, { 319,10}, { 175, 9}, { 351, 8}, \
- { 703,10}, { 191, 9}, { 383,10}, { 207, 9}, \
- { 415,12}, { 63,10}, { 255,11}, { 143,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
- { 639,11}, { 175,12}, { 95,11}, { 191,10}, \
- { 383,11}, { 207,10}, { 415,11}, { 223,13}, \
- { 63,11}, { 287,10}, { 575,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 351,12}, { 191,11}, \
- { 415,12}, { 223,11}, { 447,10}, { 895,11}, \
- { 479,12}, { 287,11}, { 575,12}, { 351,13}, \
- { 191,12}, { 479,13}, { 255,12}, { 575,13}, \
- { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
- { 447,14}, { 255,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 121
-#define MUL_FFT_THRESHOLD 4224
-
-#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 240, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 14, 5}, { 29, 7}, { 9, 6}, { 19, 7}, \
- { 13, 6}, { 27, 8}, { 7, 7}, { 21, 8}, \
- { 11, 7}, { 29, 8}, { 19, 9}, { 11, 8}, \
- { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
- { 19, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \
- { 287,10}, { 79,11}, { 47,12}, { 31,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
- { 287,11}, { 79,10}, { 159, 9}, { 319,10}, \
- { 175,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207, 9}, { 415,11}, { 111,10}, { 223,12}, \
- { 63,11}, { 175,12}, { 95,11}, { 207,13}, \
- { 63,12}, { 127,11}, { 287,12}, { 159,11}, \
- { 351,12}, { 191,11}, { 415,12}, { 223,11}, \
- { 447,13}, { 127,12}, { 351,13}, { 191,12}, \
- { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
- { 447,14}, { 127,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,13}, { 319,12}, { 703,13}, \
- { 383,12}, { 831,13}, { 447,14}, { 255,13}, \
- { 511,12}, { 1023,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 105
-#define SQR_FFT_THRESHOLD 3968
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 45
-#define MULLO_MUL_N_THRESHOLD 8397
-
-#define DC_DIV_QR_THRESHOLD 47
-#define DC_DIVAPPR_Q_THRESHOLD 168
-#define DC_BDIV_QR_THRESHOLD 47
-#define DC_BDIV_Q_THRESHOLD 110
-
-#define INV_MULMOD_BNM1_THRESHOLD 26
-#define INV_NEWTON_THRESHOLD 189
-#define INV_APPR_THRESHOLD 181
-
-#define BINV_NEWTON_THRESHOLD 196
-#define REDC_1_TO_REDC_N_THRESHOLD 51
-
-#define MU_DIV_QR_THRESHOLD 1558
-#define MU_DIVAPPR_Q_THRESHOLD 1558
-#define MUPI_DIV_QR_THRESHOLD 90
-#define MU_BDIV_QR_THRESHOLD 855
-#define MU_BDIV_Q_THRESHOLD 1078
-
-#define POWM_SEC_TABLE 1,16,90,452,1221
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 99
-#define HGCD_APPR_THRESHOLD 103
-#define HGCD_REDUCE_THRESHOLD 2899
-#define GCD_DC_THRESHOLD 283
-#define GCDEXT_DC_THRESHOLD 201
-#define JACOBI_BASE_METHOD 3
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 426
-#define SET_STR_PRECOMPUTE_THRESHOLD 1505
-
-#define FAC_DSC_THRESHOLD 1404
-#define FAC_ODD_THRESHOLD 0 /* always */
+
+/* Generated by tuneup.c, 2009-01-15, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 14
+#define MUL_TOOM3_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 118
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_KARATSUBA_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 77
+#define SQR_TOOM4_THRESHOLD 136
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 44
+#define MULLOW_MUL_N_THRESHOLD 246
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */
+#define DIV_DC_THRESHOLD 53
+#define POWM_THRESHOLD 85
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 104
+#define GCD_DC_THRESHOLD 321
+#define GCDEXT_DC_THRESHOLD 298
+#define JACOBI_BASE_METHOD 3
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 13
+#define MOD_1_2_THRESHOLD 14
+#define MOD_1_4_THRESHOLD 16
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define USE_PREINV_MOD_1 1 /* preinv always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 20
+#define GET_STR_PRECOMPUTE_THRESHOLD 32
+#define SET_STR_DC_THRESHOLD 532
+#define SET_STR_PRECOMPUTE_THRESHOLD 1501
+
+#define MUL_FFT_TABLE { 240, 480, 1344, 1792, 5120, 20480, 81920, 196608, 0 }
+#define MUL_FFT_MODF_THRESHOLD 240
+#define MUL_FFT_THRESHOLD 1920
+
+#define SQR_FFT_TABLE { 240, 480, 1216, 1792, 5120, 12288, 81920, 196608, 0 }
+#define SQR_FFT_MODF_THRESHOLD 208
+#define SQR_FFT_THRESHOLD 1408
+
+/* These tables need to be updated. */
+
+#define MUL_FFT_TABLE2 {{1, 4}, {177, 5}, {193, 4}, {209, 5}, {353, 6}, {385, 5}, {417, 6}, {833, 7}, {897, 6}, {961, 7}, {1025, 6}, {1089, 7}, {1665, 8}, {1793, 7}, {2177, 8}, {2305, 7}, {2433, 8}, {2817, 7}, {2945, 8}, {3329, 9}, {3457, 8}, {4865, 9}, {5633, 8}, {6401, 10}, {7169, 9}, {11777, 10}, {12801, 9}, {13825, 10}, {15361, 9}, {19969, 10}, {23553, 9}, {24065, 11}, {30721, 10}, {48129, 11}, {63489, 10}, {72705, 11}, {96257, 12}, {126977, 11}, {194561, 12}, {258049, 11}, {325633, 12}, {389121, 13}, {516097, 12}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1, 4}, {177, 5}, {193, 4}, {209, 5}, {353, 6}, {385, 5}, {417, 6}, {961, 7}, {1025, 6}, {1089, 7}, {1153, 6}, {1217, 7}, {1665, 8}, {1793, 7}, {2177, 8}, {2305, 7}, {2561, 8}, {2817, 7}, {2945, 8}, {3329, 9}, {3585, 8}, {5377, 9}, {5633, 8}, {6401, 9}, {6657, 10}, {6913, 9}, {11777, 10}, {13313, 9}, {13825, 10}, {15361, 9}, {18945, 10}, {19457, 9}, {19969, 10}, {23553, 9}, {24065, 11}, {30721, 10}, {48129, 11}, {53249, 10}, {56321, 11}, {63489, 10}, {72705, 11}, {73729, 10}, {79873, 11}, {96257, 12}, {126977, 11}, {194561, 12}, {258049, 11}, {325633, 12}, {389121, 13}, {516097, 12}, {1699841, 13}, {1708033, 12}, {1732609, 13}, {1748993, 12}, {1757185, 13}, {1773569, 12}, {1777665, 13}, {1781761, 12}, {1789953, 13}, {1806337, 12}, {1818625, 13}, {1822721, 12}, {1826817, 13}, {1830913, 12}, {1961985, 13}, {MP_SIZE_T_MAX,0}}
diff --git a/gmp/mpn/alpha/ev5/lshift.asm b/gmp/mpn/alpha/ev5/lshift.asm
new file mode 100644
index 0000000000..04385d3484
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/lshift.asm
@@ -0,0 +1,171 @@
+dnl Alpha EV5 mpn_lshift -- Shift a number left.
+
+dnl Copyright 1994, 1995, 2000, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 3.25
+C EV6: 1.75
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+C cnt r19
+
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C r17 = up + 8*n, just past the top source limb
+ ldq r4,-8(r17) C r4 = most significant source limb
+ subq r31,r19,r20 C r20 = -cnt; shift insns use count bits 5..0 only, so it acts as 64-cnt
+ s8addq r18,r16,r16 C r16 = rp + 8*n, just past the top result limb
+ subq r18,1,r18 C r18 = n-1
+ and r18,4-1,r28 C r28 = (n-1) mod 4, number of limbs in first loop
+ srl r4,r20,r0 C function result: bits shifted out of the top limb
+
+ beq r28,$L0 C n-1 is a multiple of 4, skip the one-limb loop
+ subq r18,r28,r18 C r18 = limbs left for the 4-way code, a multiple of 4
+
+ ALIGN(8)
+$Loop0: ldq r3,-16(r17) C r3 = next lower source limb
+ subq r16,8,r16 C step result pointer down one limb
+ sll r4,r19,r5 C high part, from the current limb
+ subq r17,8,r17 C step source pointer down one limb
+ subq r28,1,r28 C count down the one-limb iterations
+ srl r3,r20,r6 C low part, from the next lower limb
+ bis r3,r3,r4 C the lower limb becomes the current limb
+ bis r5,r6,r8 C combine both parts
+ stq r8,0(r16) C store one result limb
+ bne r28,$Loop0
+
+$L0: sll r4,r19,r24 C r24 = high part of the current limb, still to be combined
+ beq r18,$Lend C no group of 4 left, only the lowest result limb remains
+C warm up phase 1
+ ldq r1,-16(r17) C load the first group of four lower limbs into r1..r4
+ subq r18,4,r18 C one group accounted for
+ ldq r2,-24(r17)
+ ldq r3,-32(r17)
+ ldq r4,-40(r17)
+ beq r18,$Lend1 C that was the only group, finish without the main loop
+C warm up phase 2
+ srl r1,r20,r7 C start shifting the first group
+ sll r1,r19,r21
+ srl r2,r20,r8
+ ldq r1,-48(r17) C meanwhile load the second group into r1..r4
+ sll r2,r19,r22
+ ldq r2,-56(r17)
+ srl r3,r20,r5
+ bis r7,r24,r7 C r7 = 1st result limb of the group
+ sll r3,r19,r23
+ bis r8,r21,r8 C r8 = 2nd result limb of the group
+ srl r4,r20,r6
+ ldq r3,-64(r17)
+ sll r4,r19,r24 C r24 = high part carried into the next group
+ ldq r4,-72(r17)
+ subq r18,4,r18 C second group accounted for
+ beq r18,$Lend2 C no further group to load, drain the pipeline
+ ALIGN(16)
+C main loop
+$Loop: stq r7,-8(r16) C store 1st result limb of the previous group
+ bis r5,r22,r5 C complete 3rd result limb of the previous group
+ stq r8,-16(r16) C store 2nd result limb of the previous group
+ bis r6,r23,r6 C complete 4th result limb of the previous group
+
+ srl r1,r20,r7 C start shifting the group held in r1..r4
+ subq r18,4,r18 C decr loop cnt by one group
+ sll r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ srl r2,r20,r8
+ ldq r1,-80(r17) C load the following group
+ sll r2,r19,r22
+ ldq r2,-88(r17)
+
+ stq r5,-24(r16) C store 3rd result limb of the previous group
+ bis r7,r24,r7
+ stq r6,-32(r16) C store 4th result limb of the previous group
+ bis r8,r21,r8
+
+ srl r3,r20,r5
+ unop C ldq r31,-96(r17)
+ sll r3,r19,r23
+ subq r16,32,r16 C step result pointer down four limbs
+
+ srl r4,r20,r6
+ ldq r3,-96(r17)
+ sll r4,r19,r24 C r24 = high part carried into the next group
+ ldq r4,-104(r17)
+
+ subq r17,32,r17 C step source pointer down four limbs
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,-8(r16) C store the group that is already shifted
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ srl r1,r20,r7 C shift the last loaded group, no more loads
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+ srl r3,r20,r5
+ sll r3,r19,r23
+ srl r4,r20,r6
+ sll r4,r19,r24 C r24 = lowest source limb shifted left
+C cool down phase 2/2
+ stq r7,-40(r16) C store the last group
+ bis r5,r22,r5
+ stq r8,-48(r16)
+ bis r6,r23,r6
+ stq r5,-56(r16)
+ stq r6,-64(r16)
+C cool down phase 2/3
+ stq r24,-72(r16) C lowest result limb, nothing shifted in from below
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: srl r1,r20,r7 C shift the only group, no more loads
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ sll r4,r19,r24 C r24 = lowest source limb shifted left
+C cool down phase 1/2
+ stq r7,-8(r16) C store the four result limbs of the group
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ stq r5,-24(r16)
+ stq r6,-32(r16)
+ stq r24,-40(r16) C lowest result limb, nothing shifted in from below
+ ret r31,(r26),1
+
+$Lend: stq r24,-8(r16) C lowest result limb, nothing shifted in from below
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/gmp/mpn/alpha/ev5/rshift.asm b/gmp/mpn/alpha/ev5/rshift.asm
new file mode 100644
index 0000000000..0244da35a5
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/rshift.asm
@@ -0,0 +1,169 @@
+dnl Alpha EV5 mpn_rshift -- Shift a number right.
+
+dnl Copyright 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 3.25
+C EV6: 1.75
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+C cnt r19
+
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C r4 = least significant source limb
+ subq r31,r19,r20 C r20 = -cnt; shift insns use count bits 5..0 only, so it acts as 64-cnt
+ subq r18,1,r18 C r18 = n-1
+ and r18,4-1,r28 C r28 = (n-1) mod 4, number of limbs in first loop
+ sll r4,r20,r0 C function result: bits shifted out of the low limb, left-justified
+
+ beq r28,$L0 C n-1 is a multiple of 4, skip the one-limb loop
+ subq r18,r28,r18 C r18 = limbs left for the 4-way code, a multiple of 4
+
+ ALIGN(8)
+$Loop0: ldq r3,8(r17) C r3 = next higher source limb
+ addq r16,8,r16 C step result pointer up one limb
+ srl r4,r19,r5 C low part, from the current limb
+ addq r17,8,r17 C step source pointer up one limb
+ subq r28,1,r28 C count down the one-limb iterations
+ sll r3,r20,r6 C high part, from the next higher limb
+ bis r3,r3,r4 C the higher limb becomes the current limb
+ bis r5,r6,r8 C combine both parts
+ stq r8,-8(r16) C store one result limb (r16 was already advanced)
+ bne r28,$Loop0
+
+$L0: srl r4,r19,r24 C r24 = low part of the current limb, still to be combined
+ beq r18,$Lend C no group of 4 left, only the highest result limb remains
+C warm up phase 1
+ ldq r1,8(r17) C load the first group of four higher limbs into r1..r4
+ subq r18,4,r18 C one group accounted for
+ ldq r2,16(r17)
+ ldq r3,24(r17)
+ ldq r4,32(r17)
+ beq r18,$Lend1 C that was the only group, finish without the main loop
+C warm up phase 2
+ sll r1,r20,r7 C start shifting the first group
+ srl r1,r19,r21
+ sll r2,r20,r8
+ ldq r1,40(r17) C meanwhile load the second group into r1..r4
+ srl r2,r19,r22
+ ldq r2,48(r17)
+ sll r3,r20,r5
+ bis r7,r24,r7 C r7 = 1st result limb of the group
+ srl r3,r19,r23
+ bis r8,r21,r8 C r8 = 2nd result limb of the group
+ sll r4,r20,r6
+ ldq r3,56(r17)
+ srl r4,r19,r24 C r24 = low part carried into the next group
+ ldq r4,64(r17)
+ subq r18,4,r18 C second group accounted for
+ beq r18,$Lend2 C no further group to load, drain the pipeline
+ ALIGN(16)
+C main loop
+$Loop: stq r7,0(r16) C store 1st result limb of the previous group
+ bis r5,r22,r5 C complete 3rd result limb of the previous group
+ stq r8,8(r16) C store 2nd result limb of the previous group
+ bis r6,r23,r6 C complete 4th result limb of the previous group
+
+ sll r1,r20,r7 C start shifting the group held in r1..r4
+ subq r18,4,r18 C decr loop cnt by one group
+ srl r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ sll r2,r20,r8
+ ldq r1,72(r17) C load the following group
+ srl r2,r19,r22
+ ldq r2,80(r17)
+
+ stq r5,16(r16) C store 3rd result limb of the previous group
+ bis r7,r24,r7
+ stq r6,24(r16) C store 4th result limb of the previous group
+ bis r8,r21,r8
+
+ sll r3,r20,r5
+ unop C ldq r31,-96(r17)
+ srl r3,r19,r23
+ addq r16,32,r16 C step result pointer up four limbs
+
+ sll r4,r20,r6
+ ldq r3,88(r17)
+ srl r4,r19,r24 C r24 = low part carried into the next group
+ ldq r4,96(r17)
+
+ addq r17,32,r17 C step source pointer up four limbs
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,0(r16) C store the group that is already shifted
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ sll r1,r20,r7 C shift the last loaded group, no more loads
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+ sll r3,r20,r5
+ srl r3,r19,r23
+ sll r4,r20,r6
+ srl r4,r19,r24 C r24 = highest source limb shifted right
+C cool down phase 2/2
+ stq r7,32(r16) C store the last group
+ bis r5,r22,r5
+ stq r8,40(r16)
+ bis r6,r23,r6
+ stq r5,48(r16)
+ stq r6,56(r16)
+C cool down phase 2/3
+ stq r24,64(r16) C highest result limb, nothing shifted in from above
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: sll r1,r20,r7 C shift the only group, no more loads
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ srl r4,r19,r24 C r24 = highest source limb shifted right
+C cool down phase 1/2
+ stq r7,0(r16) C store the four result limbs of the group
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ stq r5,16(r16)
+ stq r6,24(r16)
+ stq r24,32(r16) C highest result limb, nothing shifted in from above
+ ret r31,(r26),1
+
+$Lend: stq r24,0(r16) C highest result limb, nothing shifted in from above
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/gmp/mpn/alpha/ev5/sub_n.asm b/gmp/mpn/alpha/ev5/sub_n.asm
new file mode 100644
index 0000000000..2c25fad400
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/sub_n.asm
@@ -0,0 +1,146 @@
+dnl Alpha EV5 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright 1995, 1999, 2000, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 4.75
+C EV6: 3
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18) C r0..r3 take s2 limbs, r4..r7 take s1 limbs
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ subq r4,r0,r20 C 1st main subtract (cy is still zero here)
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r4,r0,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18) C load 1st s2 limb of the next group
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18) C load 2nd s2 limb of the next group
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17) C load 1st s1 limb of the next group
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17) C load 2nd s1 limb of the next group
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16) C store 1st result limb
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16) C store 2nd result limb
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18) C load 3rd s2 limb of the next group
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18) C load 4th s2 limb of the next group
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17) C load 3rd s1 limb of the next group
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17) C load 4th s1 limb of the next group
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16) C store 3rd result limb
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16) C store 4th result limb
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r6,r2,r28 C 3rd main subtract
+ subq r28,r25,r22 C 3rd carry subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16) C store 1st result limb
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16) C store 2nd result limb
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16) C store 3rd result limb
+ stq r23,-8(r16) C store 4th result limb
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret C no limbs left, return the cy accumulated so far
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18) C next s2 limb
+ ldq r4,0(r17) C next s1 limb
+ subq r19,1,r19 C decr loop cnt
+ beq r19,$Lend0 C only one limb left, skip the loop
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18) C load the following s2 limb
+ ldq r4,8(r17) C load the following s1 limb
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18 C update s2_ptr
+ addq r17,8,r17 C update s1_ptr
+ stq r20,0(r16) C store result limb
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16 C update res_ptr
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16) C store last result limb
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/add_n.asm b/gmp/mpn/alpha/ev6/add_n.asm
index 9261f31b8a..114af73aa0 100644
--- a/gmp/mpn/alpha/ev6/add_n.asm
+++ b/gmp/mpn/alpha/ev6/add_n.asm
@@ -4,30 +4,19 @@ dnl store sum in a third limb vector.
dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev6/aorslsh1_n.asm b/gmp/mpn/alpha/ev6/aorslsh1_n.asm
deleted file mode 100644
index cb966ce021..0000000000
--- a/gmp/mpn/alpha/ev6/aorslsh1_n.asm
+++ /dev/null
@@ -1,172 +0,0 @@
-dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
-
-dnl Copyright 2003, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C EV4: ?
-C EV5: 7
-C EV6: 4
-
-C TODO
-C * Tune to reach 3.75 c/l on ev6.
-
-define(`rp',`r16')
-define(`up',`r17')
-define(`vp',`r18')
-define(`n', `r19')
-
-define(`u0', `r8')
-define(`u1', `r1')
-define(`v0', `r4')
-define(`v1', `r5')
-
-define(`cy0', `r0')
-define(`cy1', `r20')
-define(`cy', `r22')
-define(`rr', `r24')
-define(`ps', `r25')
-define(`sl', `r28')
-
-ifdef(`OPERATION_addlsh1_n',`
- define(ADDSUB, addq)
- define(CARRY, `cmpult $1,$2,$3')
- define(func, mpn_addlsh1_n)
-')
-ifdef(`OPERATION_sublsh1_n',`
- define(ADDSUB, subq)
- define(CARRY, `cmpult $2,$1,$3')
- define(func, mpn_sublsh1_n)
-')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
-
-ASM_START()
-PROLOGUE(func)
- and n, 2, cy0
- blbs n, L(bx1)
-L(bx0): ldq v1, 0(vp)
- ldq u1, 0(up)
- lda r2, 0(r31)
- bne cy0, L(b10)
-
-L(b00): lda vp, 48(vp)
- lda up, -16(up)
- lda rp, -8(rp)
- lda cy0, 0(r31)
- br r31, L(lo0)
-
-L(b10): lda vp, 32(vp)
- lda rp, 8(rp)
- lda cy0, 0(r31)
- br r31, L(lo2)
-
-L(bx1): ldq v0, 0(vp)
- ldq u0, 0(up)
- lda r3, 0(r31)
- beq cy0, L(b01)
-
-L(b11): lda vp, 40(vp)
- lda up, -24(up)
- lda rp, 16(rp)
- lda cy1, 0(r31)
- br r31, L(lo3)
-
-L(b01): lda n, -4(n)
- lda cy1, 0(r31)
- ble n, L(end)
- lda vp, 24(vp)
- lda up, -8(up)
-
- ALIGN(16)
-L(top): addq v0, v0, r6
- ldq v1, -16(vp)
- addq r6, r3, sl C combined vlimb
- ldq u1, 16(up)
- ADDSUB u0, sl, ps C ulimb + (vlimb << 1)
- cmplt v0, r31, r2 C high v bits
- ADDSUB ps, cy1, rr C consume carry from previous operation
- CARRY( ps, u0, cy0) C carry out #2
- stq rr, 0(rp)
- CARRY( rr, ps, cy) C carry out #3
- lda vp, 32(vp) C bookkeeping
- addq cy, cy0, cy0 C final carry out
-L(lo0): addq v1, v1, r7
- ldq v0, -40(vp)
- addq r7, r2, sl
- ldq u0, 24(up)
- ADDSUB u1, sl, ps
- cmplt v1, r31, r3
- ADDSUB ps, cy0, rr
- CARRY( ps, u1, cy1)
- stq rr, 8(rp)
- CARRY( rr, ps, cy)
- lda rp, 32(rp) C bookkeeping
- addq cy, cy1, cy1
-L(lo3): addq v0, v0, r6
- ldq v1, -32(vp)
- addq r6, r3, sl
- ldq u1, 32(up)
- ADDSUB u0, sl, ps
- cmplt v0, r31, r2
- ADDSUB ps, cy1, rr
- CARRY( ps, u0, cy0)
- stq rr, -16(rp)
- CARRY( rr, ps, cy)
- lda up, 32(up) C bookkeeping
- addq cy, cy0, cy0
-L(lo2): addq v1, v1, r7
- ldq v0, -24(vp)
- addq r7, r2, sl
- ldq u0, 8(up)
- ADDSUB u1, sl, ps
- cmplt v1, r31, r3
- ADDSUB ps, cy0, rr
- CARRY( ps, u1, cy1)
- stq rr, -8(rp)
- CARRY( rr, ps, cy)
- lda n, -4(n) C bookkeeping
- addq cy, cy1, cy1
- bgt n, L(top)
-
-L(end): addq v0, v0, r6
- addq r6, r3, sl
- ADDSUB u0, sl, ps
- cmplt v0, r31, r2
- ADDSUB ps, cy1, rr
- CARRY( ps, u0, cy0)
- stq rr, 0(rp)
- CARRY( rr, ps, cy)
- addq cy, cy0, cy0
- addq cy0, r2, r0
-
- ret r31,(r26),1
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/alpha/ev6/aorsmul_1.asm b/gmp/mpn/alpha/ev6/aorsmul_1.asm
index 0e68e6e7ad..eda092b2d5 100644
--- a/gmp/mpn/alpha/ev6/aorsmul_1.asm
+++ b/gmp/mpn/alpha/ev6/aorsmul_1.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1.
-dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
+dnl Copyright 2000, 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev6/gmp-mparam.h b/gmp/mpn/alpha/ev6/gmp-mparam.h
index e51d6b0d15..a01e977433 100644
--- a/gmp/mpn/alpha/ev6/gmp-mparam.h
+++ b/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -1,209 +1,76 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free
-Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
#define DIVEXACT_BY3_METHOD 0 /* override ../diveby3.asm */
-/* 500 MHz 21164 (agnesi.math.su.se) */
-/* FFT tuning limit = 20000000 */
-/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
-#define USE_PREINV_DIVREM_1 1 /* preinv always */
-#define DIV_QR_1N_PI1_METHOD 2
-#define DIV_QR_1_NORM_THRESHOLD 5
-#define DIV_QR_1_UNNORM_THRESHOLD 1
-#define DIV_QR_2_PI2_THRESHOLD 8
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 20
-
-#define MUL_TOOM22_THRESHOLD 32
-#define MUL_TOOM33_THRESHOLD 117
-#define MUL_TOOM44_THRESHOLD 124
-#define MUL_TOOM6H_THRESHOLD 230
-#define MUL_TOOM8H_THRESHOLD 357
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 59
-#define SQR_TOOM3_THRESHOLD 123
-#define SQR_TOOM4_THRESHOLD 163
-#define SQR_TOOM6_THRESHOLD 333
-#define SQR_TOOM8_THRESHOLD 0 /* always */
-
-#define MULMID_TOOM42_THRESHOLD 52
-
-#define MULMOD_BNM1_THRESHOLD 19
-#define SQRMOD_BNM1_THRESHOLD 5
-
-#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 468, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 19, 7}, { 10, 6}, \
- { 24, 7}, { 13, 6}, { 27, 7}, { 14, 6}, \
- { 29, 7}, { 17, 6}, { 35, 7}, { 29, 8}, \
- { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
- { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 51, 9}, { 27, 8}, { 55, 9}, { 35, 8}, \
- { 71, 9}, { 39,10}, { 23, 9}, { 55,10}, \
- { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \
- { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \
- { 79,11}, { 47,10}, { 103,12}, { 31,11}, \
- { 63,10}, { 135,11}, { 79,10}, { 167,11}, \
- { 95,10}, { 199,11}, { 111,12}, { 63,11}, \
- { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 319,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,13}, { 63,12}, { 127,11}, { 255,10}, \
- { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
- { 575,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
- { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
- { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
- { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
- { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
- { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \
- { 319,12}, { 735,13}, { 383,12}, { 767,11}, \
- { 1535,12}, { 831,13}, { 447,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
- { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
- { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \
- { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 151
-#define MUL_FFT_THRESHOLD 5760
-
-#define SQR_FFT_MODF_THRESHOLD 412 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
- { 27, 7}, { 14, 6}, { 29, 7}, { 28, 8}, \
- { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \
- { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \
- { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \
- { 383,11}, { 111,12}, { 63,11}, { 127,10}, \
- { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \
- { 303,11}, { 159,10}, { 319,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
- { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
- { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
- { 671,11}, { 351,10}, { 703,11}, { 367,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
- { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
- { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
- { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
- { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
- { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
- { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \
- { 319,12}, { 703,11}, { 1407,12}, { 735,13}, \
- { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1151,13}, { 639,12}, { 1279,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
- { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
- { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \
- { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 159
-#define SQR_FFT_THRESHOLD 5056
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 100
-#define MULLO_MUL_N_THRESHOLD 11355
-
-#define DC_DIV_QR_THRESHOLD 124
-#define DC_DIVAPPR_Q_THRESHOLD 438
-#define DC_BDIV_QR_THRESHOLD 153
-#define DC_BDIV_Q_THRESHOLD 318
-
-#define INV_MULMOD_BNM1_THRESHOLD 62
-#define INV_NEWTON_THRESHOLD 384
-#define INV_APPR_THRESHOLD 402
-
-#define BINV_NEWTON_THRESHOLD 381
-#define REDC_1_TO_REDC_N_THRESHOLD 110
-
-#define MU_DIV_QR_THRESHOLD 1752
-#define MU_DIVAPPR_Q_THRESHOLD 1895
-#define MUPI_DIV_QR_THRESHOLD 174
-#define MU_BDIV_QR_THRESHOLD 1387
-#define MU_BDIV_Q_THRESHOLD 1787
-
-#define POWM_SEC_TABLE 1,13,66,82,579
-
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 318
-#define HGCD_APPR_THRESHOLD 363
-#define HGCD_REDUCE_THRESHOLD 2384
-#define GCD_DC_THRESHOLD 2504
-#define GCDEXT_DC_THRESHOLD 671
-#define JACOBI_BASE_METHOD 3
-
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 25
-#define SET_STR_DC_THRESHOLD 3754
-#define SET_STR_PRECOMPUTE_THRESHOLD 8097
-
-#define FAC_DSC_THRESHOLD 951
-#define FAC_ODD_THRESHOLD 24
+/* 500 MHz 21164 */
+
+/* Generated by tuneup.c, 2009-01-12, gcc 3.3 */
+
+#define MUL_KARATSUBA_THRESHOLD 31
+#define MUL_TOOM3_THRESHOLD 101
+#define MUL_TOOM44_THRESHOLD 168
+
+#define SQR_BASECASE_THRESHOLD 6
+#define SQR_KARATSUBA_THRESHOLD 60
+#define SQR_TOOM3_THRESHOLD 102
+#define SQR_TOOM4_THRESHOLD 172
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 102
+#define MULLOW_MUL_N_THRESHOLD 399
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */
+#define DIV_DC_THRESHOLD 134
+#define POWM_THRESHOLD 257
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 303
+#define GCD_DC_THRESHOLD 1258
+#define GCDEXT_DC_THRESHOLD 807
+#define JACOBI_BASE_METHOD 3
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 13
+#define MOD_1_2_THRESHOLD 14
+#define MOD_1_4_THRESHOLD 40
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define USE_PREINV_MOD_1 1 /* preinv always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_DC_THRESHOLD 4615
+#define SET_STR_PRECOMPUTE_THRESHOLD 8178
+
+#define MUL_FFT_TABLE { 432, 864, 1856, 3840, 11264, 28672, 81920, 327680, 0 }
+#define MUL_FFT_MODF_THRESHOLD 448
+#define MUL_FFT_THRESHOLD 4992
+
+#define SQR_FFT_TABLE { 432, 864, 1728, 3840, 9216, 20480, 81920, 327680, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD 344
+#define SQR_FFT_THRESHOLD 3712
diff --git a/gmp/mpn/alpha/ev6/mod_1_4.asm b/gmp/mpn/alpha/ev6/mod_1_4.asm
deleted file mode 100644
index 836de07c0f..0000000000
--- a/gmp/mpn/alpha/ev6/mod_1_4.asm
+++ /dev/null
@@ -1,337 +0,0 @@
-dnl Alpha mpn_mod_1s_4p
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO:
-C * Optimise. 2.75 c/l should be possible.
-C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
-C * Optimise feed-in code, starting the sw pipeline in switch code.
-C * Shorten software pipeline. The mul instructions are scheduled too far
-C from their users. Fixing this will allow us to use fewer registers.
-C * If we cannot reduce register usage, write perhaps small-n basecase.
-C * Does this work for PIC?
-
-C cycles/limb
-C EV4: ?
-C EV5: 23
-C EV6: 3
-
-define(`ap', `r16')
-define(`n', `r17')
-define(`pl', `r24')
-define(`ph', `r25')
-define(`rl', `r6')
-define(`rh', `r7')
-define(`B1modb', `r1')
-define(`B2modb', `r2')
-define(`B3modb', `r3')
-define(`B4modb', `r4')
-define(`B5modb', `r5')
-
-ASM_START()
-PROLOGUE(mpn_mod_1s_4p)
- lda r30, -64(r30)
- stq r9, 8(r30)
- ldq B1modb, 16(r19)
- stq r10, 16(r30)
- ldq B2modb, 24(r19)
- stq r11, 24(r30)
- ldq B3modb, 32(r19)
- stq r12, 32(r30)
- ldq B4modb, 40(r19)
- stq r13, 40(r30)
- ldq B5modb, 48(r19)
- s8addq n, ap, ap C point ap at vector end
-
- and n, 3, r0
- lda n, -4(n)
- beq r0, L(b0)
- lda r6, -2(r0)
- blt r6, L(b1)
- beq r6, L(b2)
-
-L(b3): ldq r21, -16(ap)
- ldq r22, -8(ap)
- ldq r20, -24(ap)
- mulq r21, B1modb, r8
- umulh r21, B1modb, r12
- mulq r22, B2modb, r9
- umulh r22, B2modb, r13
- addq r8, r20, pl
- cmpult pl, r8, r0
- addq r0, r12, ph
- addq r9, pl, rl
- cmpult rl, r9, r0
- addq r13, ph, ph
- addq r0, ph, rh
- lda ap, -56(ap)
- br L(com)
-
-L(b0): ldq r21, -24(ap)
- ldq r22, -16(ap)
- ldq r23, -8(ap)
- ldq r20, -32(ap)
- mulq r21, B1modb, r8
- umulh r21, B1modb, r12
- mulq r22, B2modb, r9
- umulh r22, B2modb, r13
- mulq r23, B3modb, r10
- umulh r23, B3modb, r27
- addq r8, r20, pl
- cmpult pl, r8, r0
- addq r0, r12, ph
- addq r9, pl, pl
- cmpult pl, r9, r0
- addq r13, ph, ph
- addq r0, ph, ph
- addq r10, pl, rl
- cmpult rl, r10, r0
- addq r27, ph, ph
- addq r0, ph, rh
- lda ap, -64(ap)
- br L(com)
-
-L(b1): bis r31, r31, rh
- ldq rl, -8(ap)
- lda ap, -40(ap)
- br L(com)
-
-L(b2): ldq rh, -8(ap)
- ldq rl, -16(ap)
- lda ap, -48(ap)
-
-L(com): ble n, L(ed3)
- ldq r21, 8(ap)
- ldq r22, 16(ap)
- ldq r23, 24(ap)
- ldq r20, 0(ap)
- lda n, -4(n)
- lda ap, -32(ap)
- mulq r21, B1modb, r8
- umulh r21, B1modb, r12
- mulq r22, B2modb, r9
- umulh r22, B2modb, r13
- mulq r23, B3modb, r10
- umulh r23, B3modb, r27
- mulq rl, B4modb, r11
- umulh rl, B4modb, r28
- ble n, L(ed2)
-
- ALIGN(16)
-L(top): ldq r21, 8(ap)
- mulq rh, B5modb, rl
- addq r8, r20, pl
- ldq r22, 16(ap)
- cmpult pl, r8, r0
- umulh rh, B5modb, rh
- ldq r23, 24(ap)
- addq r0, r12, ph
- addq r9, pl, pl
- mulq r21, B1modb, r8
- cmpult pl, r9, r0
- addq r13, ph, ph
- umulh r21, B1modb, r12
- lda ap, -32(ap)
- addq r0, ph, ph
- addq r10, pl, pl
- mulq r22, B2modb, r9
- cmpult pl, r10, r0
- addq r27, ph, ph
- addq r11, pl, pl
- umulh r22, B2modb, r13
- addq r0, ph, ph
- cmpult pl, r11, r0
- addq r28, ph, ph
- mulq r23, B3modb, r10
- ldq r20, 32(ap)
- addq pl, rl, rl
- umulh r23, B3modb, r27
- addq r0, ph, ph
- cmpult rl, pl, r0
- mulq rl, B4modb, r11
- addq ph, rh, rh
- umulh rl, B4modb, r28
- addq r0, rh, rh
- lda n, -4(n)
- bgt n, L(top)
-
-L(ed2): mulq rh, B5modb, rl
- addq r8, r20, pl
- umulh rh, B5modb, rh
- cmpult pl, r8, r0
- addq r0, r12, ph
- addq r9, pl, pl
- cmpult pl, r9, r0
- addq r13, ph, ph
- addq r0, ph, ph
- addq r10, pl, pl
- cmpult pl, r10, r0
- addq r27, ph, ph
- addq r11, pl, pl
- addq r0, ph, ph
- cmpult pl, r11, r0
- addq r28, ph, ph
- addq pl, rl, rl
- addq r0, ph, ph
- cmpult rl, pl, r0
- addq ph, rh, rh
- addq r0, rh, rh
-
-L(ed3): mulq rh, B1modb, r8
- umulh rh, B1modb, rh
- addq r8, rl, rl
- cmpult rl, r8, r0
- addq r0, rh, rh
-
- ldq r24, 8(r19) C cnt
- sll rh, r24, rh
- subq r31, r24, r25
- srl rl, r25, r2
- sll rl, r24, rl
- or r2, rh, rh
-
- ldq r23, 0(r19) C bi
- mulq rh, r23, r8
- umulh rh, r23, r9
- addq rh, 1, r7
- addq r8, rl, r8 C ql
- cmpult r8, rl, r0
- addq r9, r7, r9
- addq r0, r9, r9 C qh
- mulq r9, r18, r21 C qh * b
- subq rl, r21, rl
- cmpult r8, rl, r0 C rl > ql
- negq r0, r0
- and r0, r18, r0
- addq rl, r0, rl
- cmpule r18, rl, r0 C rl >= b
- negq r0, r0
- and r0, r18, r0
- subq rl, r0, rl
-
- srl rl, r24, r0
-
- ldq r9, 8(r30)
- ldq r10, 16(r30)
- ldq r11, 24(r30)
- ldq r12, 32(r30)
- ldq r13, 40(r30)
- lda r30, 64(r30)
- ret r31, (r26), 1
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1s_4p_cps,gp)
- lda r30, -32(r30)
- stq r26, 0(r30)
- stq r9, 8(r30)
- stq r10, 16(r30)
- stq r11, 24(r30)
- mov r16, r11
- LEA( r4, __clz_tab)
- lda r10, 65(r31)
- cmpbge r31, r17, r1
- srl r1, 1, r1
- xor r1, 127, r1
- addq r1, r4, r1
- ldq_u r2, 0(r1)
- extbl r2, r1, r2
- s8subq r2, 7, r2
- srl r17, r2, r3
- subq r10, r2, r10
- addq r3, r4, r3
- ldq_u r1, 0(r3)
- extbl r1, r3, r1
- subq r10, r1, r10
- sll r17, r10, r9
- mov r9, r16
- jsr r26, mpn_invert_limb
- ldah r29, 0(r26)
- subq r31, r10, r2
- lda r1, 1(r31)
- sll r1, r10, r1
- subq r31, r9, r3
- srl r0, r2, r2
- ldq r26, 0(r30)
- bis r2, r1, r2
- lda r29, 0(r29)
- stq r0, 0(r11)
- stq r10, 8(r11)
- mulq r2, r3, r2
- srl r2, r10, r3
- umulh r2, r0, r1
- stq r3, 16(r11)
- mulq r2, r0, r3
- ornot r31, r1, r1
- subq r1, r2, r1
- mulq r1, r9, r1
- addq r1, r9, r2
- cmpule r1, r3, r3
- cmoveq r3, r2, r1
- srl r1, r10, r3
- umulh r1, r0, r2
- stq r3, 24(r11)
- mulq r1, r0, r3
- ornot r31, r2, r2
- subq r2, r1, r2
- mulq r2, r9, r2
- addq r2, r9, r1
- cmpule r2, r3, r3
- cmoveq r3, r1, r2
- srl r2, r10, r1
- umulh r2, r0, r3
- stq r1, 32(r11)
- mulq r2, r0, r1
- ornot r31, r3, r3
- subq r3, r2, r3
- mulq r3, r9, r3
- addq r3, r9, r2
- cmpule r3, r1, r1
- cmoveq r1, r2, r3
- srl r3, r10, r2
- umulh r3, r0, r1
- stq r2, 40(r11)
- mulq r3, r0, r0
- ornot r31, r1, r1
- subq r1, r3, r1
- mulq r1, r9, r1
- addq r1, r9, r9
- cmpule r1, r0, r0
- cmoveq r0, r9, r1
- ldq r9, 8(r30)
- srl r1, r10, r1
- ldq r10, 16(r30)
- stq r1, 48(r11)
- ldq r11, 24(r30)
- lda r30, 32(r30)
- ret r31, (r26), 1
-EPILOGUE()
diff --git a/gmp/mpn/alpha/ev6/mul_1.asm b/gmp/mpn/alpha/ev6/mul_1.asm
index 8ee19cd429..841f5083cb 100644
--- a/gmp/mpn/alpha/ev6/mul_1.asm
+++ b/gmp/mpn/alpha/ev6/mul_1.asm
@@ -4,30 +4,19 @@ dnl result in a second limb vector.
dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -60,7 +49,7 @@ C r20,r29,r13-r15 scramble
C
C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
C put-the-carry-into-hi. The idea is that these branches are very rarely
-C taken, and since a non-taken branch consumes no resources, that is better
+C taken, and since a non-taken branch consumes no resources, that is better
C than an addq.
C
C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
@@ -137,7 +126,7 @@ $L_9_or_more:
mulq r2,r19,r3 C r3 = prod_low
umulh r2,r19,r21 C r21 = prod_high
beq r20,$Le1b C jump if size was == 1
- bis r31, r31, r0 C FIXME: shouldn't need this
+ bis r31, r31, r0 C FIXME: shouldn't need this
ldq r2,0(r17) C r2 = s1_limb
lda r17,8(r17) C s1_ptr++
lda r20,-1(r20) C size--
diff --git a/gmp/mpn/alpha/ev6/nails/README b/gmp/mpn/alpha/ev6/nails/README
index b214ac50ad..8b3b357a77 100644
--- a/gmp/mpn/alpha/ev6/nails/README
+++ b/gmp/mpn/alpha/ev6/nails/README
@@ -2,29 +2,18 @@ Copyright 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License along
+with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_1.asm b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
index 711d4e66e5..149195c6f4 100644
--- a/gmp/mpn/alpha/ev6/nails/addmul_1.asm
+++ b/gmp/mpn/alpha/ev6/nails/addmul_1.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 nails mpn_addmul_1.
dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -36,7 +25,7 @@ C EV5: 18
C EV6: 4
C TODO
-C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
C * The loop is overscheduled wrt loads and wrt multiplies, in particular
C umulh.
C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_2.asm b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
index 6ff6b3ad6b..9edaed8b3a 100644
--- a/gmp/mpn/alpha/ev6/nails/addmul_2.asm
+++ b/gmp/mpn/alpha/ev6/nails/addmul_2.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 nails mpn_addmul_2.
dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_3.asm b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
index a1ffb680ec..1d89769e13 100644
--- a/gmp/mpn/alpha/ev6/nails/addmul_3.asm
+++ b/gmp/mpn/alpha/ev6/nails/addmul_3.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 nails mpn_addmul_3.
dnl Copyright 2002, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev6/nails/addmul_4.asm b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
index 77e02a4316..f19b0232df 100644
--- a/gmp/mpn/alpha/ev6/nails/addmul_4.asm
+++ b/gmp/mpn/alpha/ev6/nails/addmul_4.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 nails mpn_addmul_4.
dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev6/nails/aors_n.asm b/gmp/mpn/alpha/ev6/nails/aors_n.asm
index f6586773f5..4958e81ed9 100644
--- a/gmp/mpn/alpha/ev6/nails/aors_n.asm
+++ b/gmp/mpn/alpha/ev6/nails/aors_n.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 nails mpn_add_n and mpn_sub_n.
dnl Copyright 2002, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
diff --git a/gmp/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
index 7949fe8df8..1bc93b52c6 100644
--- a/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
+++ b/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -1,43 +1,33 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
-#define MUL_TOOM22_THRESHOLD 40
-#define MUL_TOOM33_THRESHOLD 236
+#define MUL_KARATSUBA_THRESHOLD 40
+#define MUL_TOOM3_THRESHOLD 236
#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */
-#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */
+#define SQR_KARATSUBA_THRESHOLD 0 /* never sqr_basecase */
#define SQR_TOOM3_THRESHOLD 120
#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
diff --git a/gmp/mpn/alpha/ev6/nails/mul_1.asm b/gmp/mpn/alpha/ev6/nails/mul_1.asm
index da2ee3d099..cac3776ba0 100644
--- a/gmp/mpn/alpha/ev6/nails/mul_1.asm
+++ b/gmp/mpn/alpha/ev6/nails/mul_1.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 nails mpn_mul_1.
dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -36,10 +25,10 @@ C EV5: 18
C EV6: 3.25
C TODO
-C * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C * Reroll loop for 3.0 c/l with current 4-way unrolling.
C * The loop is overscheduled wrt loads and wrt multiplies, in particular
C umulh.
-C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
C and would work since the loop structure is really regular.
C INPUT PARAMETERS
diff --git a/gmp/mpn/alpha/ev6/nails/submul_1.asm b/gmp/mpn/alpha/ev6/nails/submul_1.asm
index f473a59ba8..4242517a4a 100644
--- a/gmp/mpn/alpha/ev6/nails/submul_1.asm
+++ b/gmp/mpn/alpha/ev6/nails/submul_1.asm
@@ -1,32 +1,21 @@
dnl Alpha ev6 nails mpn_submul_1.
dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -36,10 +25,10 @@ C EV5: 18
C EV6: 4
C TODO
-C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
C * The loop is overscheduled wrt loads and wrt multiplies, in particular
C umulh.
-C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
C and would work since the loop structure is really regular.
C INPUT PARAMETERS
diff --git a/gmp/mpn/alpha/ev6/slot.pl b/gmp/mpn/alpha/ev6/slot.pl
index a4c8a36882..17967e79a2 100755..100644
--- a/gmp/mpn/alpha/ev6/slot.pl
+++ b/gmp/mpn/alpha/ev6/slot.pl
@@ -1,32 +1,21 @@
#!/usr/bin/perl -w
-# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc.
+# Copyright 2000, 2001, 2003, 2004, 2005 Free Software Foundation, Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# Usage: slot.pl [filename.o]...
@@ -51,12 +40,9 @@ my %optable =
(
'addq' => 'E',
'and' => 'E',
- 'andnot' => 'E',
'beq' => 'U',
'bge' => 'U',
'bgt' => 'U',
- 'bic' => 'E',
- 'bis' => 'E',
'blt' => 'U',
'bne' => 'U',
'br' => 'L',
@@ -85,7 +71,6 @@ my %optable =
'ldt' => 'L',
'ret' => 'L',
'mov' => 'E',
- 'mull' => 'U',
'mulq' => 'U',
'negq' => 'E',
'nop' => 'E',
diff --git a/gmp/mpn/alpha/ev6/sqr_diagonal.asm b/gmp/mpn/alpha/ev6/sqr_diagonal.asm
new file mode 100644
index 0000000000..58d086e624
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/sqr_diagonal.asm
@@ -0,0 +1,115 @@
+dnl Alpha mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: ?
+C EV6: 2.3
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal) C for each of the n limbs up[i], store the two-limb square at rp[2i] (low) and rp[2i+1] (high)
+ lda r18, -2(r18) C n -= 2
+ ldq r0, 0(r17) C r0 = up[0]
+ mulq r0, r0, r4 C r4 = low limb of up[0]^2
+ umulh r0, r0, r20 C r20 = high limb of up[0]^2
+ blt r18, L(ex1) C n was 1: a single square to store
+ ldq r1, 8(r17) C r1 = up[1]
+ mulq r1, r1, r5 C r5 = low limb of up[1]^2
+ umulh r1, r1, r21 C r21 = high limb of up[1]^2
+ beq r18, L(ex2) C n was 2: two squares to store
+ lda r18, -2(r18) C n -= 2
+ ldq r0, 16(r17) C r0 = up[2], loaded now and squared later
+ blt r18, L(ex3) C n was 3
+ ldq r1, 24(r17) C r1 = up[3], loaded now and squared later
+ beq r18, L(ex4) C n was 4
+
+ ALIGN(16)
+L(top): lda r18, -2(r18) C n -= 2
+ stq r4, 0(r16) C store the pending square held in r4/r20 ...
+ mulq r0, r0, r4 C ... while squaring the limb preloaded in r0
+ stq r20, 8(r16)
+ umulh r0, r0, r20
+ ldq r0, 32(r17) C preload the limb four positions past up
+ blt r18, L(x) C count went negative: the limb just loaded is the last one
+ stq r5, 16(r16) C store the pending square held in r5/r21 ...
+ mulq r1, r1, r5 C ... while squaring the limb preloaded in r1
+ stq r21, 24(r16)
+ umulh r1, r1, r21
+ ldq r1, 40(r17) C preload the limb five positions past up
+ lda r16, 32(r16) C rp += 4
+ lda r17, 16(r17) C up += 2
+ bne r18, L(top) C count reached zero: fall through into ex4 to drain
+
+ ALIGN(16)
+L(ex4): stq r4, 0(r16) C drain: two pending squares in r4/r20 and r5/r21, two unsquared limbs in r0 and r1
+ mulq r0, r0, r4
+ stq r20, 8(r16)
+ umulh r0, r0, r20
+ stq r5, 16(r16)
+ mulq r1, r1, r5
+ stq r21, 24(r16)
+ umulh r1, r1, r21
+ stq r4, 32(r16) C square of the limb that was in r0
+ stq r20, 40(r16)
+ stq r5, 48(r16) C square of the limb that was in r1
+ stq r21, 56(r16)
+ ret r31, (r26), 1
+ ALIGN(16)
+L(x): stq r5, 16(r16) C odd-count exit from the loop: r5/r21 pending, r1 unsquared, r4/r20 freshly computed, r0 holds the last limb
+ mulq r1, r1, r5
+ stq r21, 24(r16)
+ umulh r1, r1, r21
+ stq r4, 32(r16) C square computed at the top of this loop pass
+ mulq r0, r0, r4 C square the last limb
+ stq r20, 40(r16)
+ umulh r0, r0, r20
+ stq r5, 48(r16)
+ stq r21, 56(r16)
+ stq r4, 64(r16) C square of the last limb
+ stq r20, 72(r16)
+ ret r31, (r26), 1
+L(ex1): stq r4, 0(r16) C n = 1: store up[0]^2 only
+ stq r20, 8(r16)
+ ret r31, (r26), 1
+ ALIGN(16)
+L(ex2): stq r4, 0(r16) C n = 2: store up[0]^2 and up[1]^2
+ stq r20, 8(r16)
+ stq r5, 16(r16)
+ stq r21, 24(r16)
+ ret r31, (r26), 1
+ ALIGN(16)
+L(ex3): stq r4, 0(r16) C n = 3: store up[0]^2, then square up[2] held in r0
+ mulq r0, r0, r4
+ stq r20, 8(r16)
+ umulh r0, r0, r20
+ stq r5, 16(r16) C up[1]^2
+ stq r21, 24(r16)
+ stq r4, 32(r16) C up[2]^2
+ stq r20, 40(r16)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/alpha/ev6/sub_n.asm b/gmp/mpn/alpha/ev6/sub_n.asm
index a35ba40d34..f23ad44a15 100644
--- a/gmp/mpn/alpha/ev6/sub_n.asm
+++ b/gmp/mpn/alpha/ev6/sub_n.asm
@@ -4,30 +4,19 @@ dnl and store difference in a third limb vector.
dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev67/gcd_1.asm b/gmp/mpn/alpha/ev67/gcd_1.asm
index 55fa7d3673..2e6f0a5e22 100644
--- a/gmp/mpn/alpha/ev67/gcd_1.asm
+++ b/gmp/mpn/alpha/ev67/gcd_1.asm
@@ -4,29 +4,18 @@ dnl Copyright 2003, 2004 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev67/hamdist.asm b/gmp/mpn/alpha/ev67/hamdist.asm
index 4b13e9f14f..a72d95e90b 100644
--- a/gmp/mpn/alpha/ev67/hamdist.asm
+++ b/gmp/mpn/alpha/ev67/hamdist.asm
@@ -4,29 +4,18 @@ dnl Copyright 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/ev67/popcount.asm b/gmp/mpn/alpha/ev67/popcount.asm
index 049c1cd239..6ed79cf158 100644
--- a/gmp/mpn/alpha/ev67/popcount.asm
+++ b/gmp/mpn/alpha/ev67/popcount.asm
@@ -4,29 +4,18 @@ dnl Copyright 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/gmp-mparam.h b/gmp/mpn/alpha/gmp-mparam.h
index b850bd24b5..6b6d7bd9c8 100644
--- a/gmp/mpn/alpha/gmp-mparam.h
+++ b/gmp/mpn/alpha/gmp-mparam.h
@@ -1,54 +1,43 @@
/* Alpha EV4 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2009 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004, 2005, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
/* 175MHz 21064 */
/* Generated by tuneup.c, 2009-01-15, gcc 3.2 */
-#define MUL_TOOM22_THRESHOLD 12
-#define MUL_TOOM33_THRESHOLD 69
+#define MUL_KARATSUBA_THRESHOLD 12
+#define MUL_TOOM3_THRESHOLD 69
#define MUL_TOOM44_THRESHOLD 88
#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 20
+#define SQR_KARATSUBA_THRESHOLD 20
#define SQR_TOOM3_THRESHOLD 62
#define SQR_TOOM4_THRESHOLD 155
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 40
-#define MULLO_MUL_N_THRESHOLD 202
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 40
+#define MULLOW_MUL_N_THRESHOLD 202
#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */
#define DIV_DC_THRESHOLD 38
diff --git a/gmp/mpn/alpha/invert_limb.asm b/gmp/mpn/alpha/invert_limb.asm
index afc010f58c..99f51a30d5 100644
--- a/gmp/mpn/alpha/invert_limb.asm
+++ b/gmp/mpn/alpha/invert_limb.asm
@@ -1,95 +1,342 @@
dnl Alpha mpn_invert_limb -- Invert a normalized limb.
-dnl Copyright 1996, 2000-2003, 2007, 2011, 2013 Free Software Foundation, Inc.
-
+dnl Copyright 1996, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
+dnl Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C EV4: ?
-C EV5: 137/140 (with BWX/without BWX)
-C EV6: 71/72 (with BWX/without BWX)
+C EV4: ~175
+C EV5: ~111-126
+C EV6: ~52-76
-C This was compiler generated, with minimal manual edits. Surely several
-C cycles could be cut with some thought.
+C This is based on ideas of Peter L. Montgomery.
ASM_START()
+
+FLOAT64($C36,9223372036854775808.0) C 2^63
+
PROLOGUE(mpn_invert_limb,gp)
- LEA( r2, approx_tab)
- srl r16, 54, r1
- srl r16, 24, r4
- and r16, 1, r5
- bic r1, 1, r7
- lda r4, 1(r4)
- srl r16, 1, r3
- addq r7, r2, r1
-ifelse(bwx_available_p,1,`
- ldwu r0, -512(r1)
-',`
- ldq_u r0, -512(r1)
- extwl r0, r7, r0
-')
- addq r3, r5, r3
- mull r0, r0, r1
- sll r0, 11, r0
- mulq r1, r4, r1
- srl r1, 40, r1
- subq r0, r1, r0
- lda r0, -1(r0)
- mulq r0, r0, r2
- sll r0, 60, r1
- sll r0, 13, r0
- mulq r2, r4, r2
- subq r1, r2, r1
- srl r1, 47, r1
- addq r0, r1, r0
- mulq r0, r3, r3
- srl r0, 1, r1
- cmoveq r5, 0, r1
- subq r1, r3, r1
- umulh r1, r0, r3
- sll r0, 31, r0
- srl r3, 1, r1
- addq r0, r1, r0
- mulq r0, r16, r2
- umulh r0, r16, r3
- addq r2, r16, r1
- addq r3, r16, r16
- cmpult r1, r2, r1
- addq r16, r1, r3
- subq r0, r3, r0
- ret r31, (r26), 1
-EPILOGUE()
-DATASTART(approx_tab,8)
-forloop(i,256,512-1,dnl
-` .word eval(0x7fd00/i)
-')dnl
- SIZE(approx_tab, 512)
- TYPE(approx_tab, object)
+ lda r30,-16(r30) C reserve 16 bytes of stack, used as scratch to move values between integer and FP registers
+ addq r16,r16,r1 C r1 = 2*u mod 2^64, which discards the top bit of u
+ bne r1,$73 C some bit below the top one is set: take the general path
+ lda r0,-1 C otherwise the result is all ones
+ br r31,$Lend
+$73:
+ srl r16,1,r1 C r1 = u / 2, non-negative when read as a signed quadword
+ stq r1,0(r30)
+ ldt f11,0(r30) C same bits, now in an FP register
+ cvtqt f11,f1 C f1 = u / 2 converted to double
+ LEA(r1,$C36) C r1 = address of the 2^63 constant
+ ldt f10,0(r1) C f10 = 2^63
+ divt f10,f1,f10 C f10 = 2^63 / (u / 2)
+ LEA(r2,$invtab-4096) C table base biased by 4096 bytes; assumes u is normalized so the doubled 12-bit index below is at least 4096
+ srl r16,52,r1 C extract high 12 bits
+ addq r1,r1,r1 C align ...0000bbbbbbbb0
+ addq r1,r2,r1 C compute array offset
+ ldq_u r2,0(r1) C load quadword containing our 16 bits
+bigend(`addq r1,1,r1')
+ extwl r2,r1,r2 C extract desired 16 bits
+ sll r2,48,r0 C r0 = table entry moved to the top 16 bits: first approximation
+ umulh r16,r0,r1 C r1 = high limb of u * r0
+ addq r16,r1,r3 C r3 = u + r1 mod 2^64
+ stq r3,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1 C f1 = r3 read as a signed quadword, converted to double
+ mult f1,f10,f1 C scale by 2^63 / (u / 2)
+ cvttqc f1,f1 C convert back to a quadword integer with chopped rounding
+ stt f1,0(r30)
+ ldq r4,0(r30) C r4 = integer correction term
+ subq r0,r4,r0 C refined approximation
+ umulh r16,r0,r1 C r1 = high limb of u * r0
+ mulq r16,r0,r2 C r2 = low limb of u * r0
+ addq r16,r1,r3 C r3:r2 = u * (2^64 + r0) mod 2^128
+ bge r3,$Loop2 C high limb non-negative as signed: go straight to stepping r0 down
+$Loop1: addq r2,r16,r2 C step up: low limb += u
+ cmpult r2,r16,r1 C r1 = carry out of the low limb
+ addq r3,r1,r3 C propagate the carry into the high limb
+ addq r0,1,r0 C r0 += 1 to match the added u
+ blt r3,$Loop1 C repeat while the high limb is negative as signed
+$Loop2: cmpult r2,r16,r1 C step down: r1 = borrow, set when low limb < u
+ subq r0,1,r0 C r0 -= 1 to match the subtracted u
+ subq r3,r1,r3 C propagate the borrow into the high limb
+ subq r2,r16,r2 C low limb -= u
+ bge r3,$Loop2 C repeat until the high limb is negative as signed
+$Lend:
+ lda r30,16(r30) C release the stack scratch area
+ ret r31,(r26),1 C result is in r0
+EPILOGUE(mpn_invert_limb)
+DATASTART($invtab)
+ .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41
+ .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46
+ .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50
+ .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d
+ .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e
+ .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483
+ .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c
+ .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8
+ .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8
+ .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb
+ .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22
+ .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d
+ .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b
+ .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad
+ .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2
+ .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a
+ .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056
+ .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95
+ .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7
+ .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d
+ .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965
+ .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1
+ .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600
+ .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452
+ .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7
+ .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100
+ .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b
+ .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9
+ .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a
+ .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e
+ .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5
+ .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f
+ .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb
+ .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a
+ .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c
+ .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111
+ .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89
+ .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03
+ .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80
+ .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff
+ .word 0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981
+ .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806
+ .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d
+ .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516
+ .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2
+ .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231
+ .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2
+ .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55
+ .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb
+ .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83
+ .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e
+ .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb
+ .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a
+ .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb
+ .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f
+ .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445
+ .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed
+ .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197
+ .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044
+ .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2
+ .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3
+ .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56
+ .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b
+ .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2
+ .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b
+ .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736
+ .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3
+ .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3
+ .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374
+ .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237
+ .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc
+ .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3
+ .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b
+ .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56
+ .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23
+ .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1
+ .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1
+ .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893
+ .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767
+ .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d
+ .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514
+ .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee
+ .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9
+ .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5
+ .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084
+ .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64
+ .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45
+ .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29
+ .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e
+ .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5
+ .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd
+ .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7
+ .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2
+ .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0
+ .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e
+ .word 0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e
+ .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370
+ .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264
+ .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158
+ .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f
+ .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46
+ .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40
+ .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a
+ .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37
+ .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34
+ .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33
+ .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934
+ .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836
+ .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739
+ .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e
+ .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544
+ .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b
+ .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354
+ .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e
+ .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169
+ .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076
+ .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84
+ .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93
+ .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4
+ .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6
+ .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9
+ .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add
+ .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3
+ .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a
+ .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822
+ .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b
+ .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656
+ .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571
+ .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e
+ .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac
+ .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb
+ .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec
+ .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d
+ .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030
+ .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54
+ .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79
+ .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f
+ .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6
+ .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee
+ .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18
+ .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42
+ .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e
+ .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a
+ .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8
+ .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6
+ .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626
+ .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557
+ .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489
+ .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc
+ .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef
+ .word 0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224
+ .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a
+ .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091
+ .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9
+ .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01
+ .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b
+ .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76
+ .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1
+ .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee
+ .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b
+ .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a
+ .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9
+ .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea
+ .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b
+ .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d
+ .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0
+ .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4
+ .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539
+ .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e
+ .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5
+ .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c
+ .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255
+ .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e
+ .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8
+ .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033
+ .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e
+ .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb
+ .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18
+ .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66
+ .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5
+ .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05
+ .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56
+ .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7
+ .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9
+ .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c
+ .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0
+ .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5
+ .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a
+ .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0
+ .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7
+ .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e
+ .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7
+ .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400
+ .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a
+ .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4
+ .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210
+ .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c
+ .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8
+ .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026
+ .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84
+ .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3
+ .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42
+ .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3
+ .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04
+ .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65
+ .word 0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8
+ .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b
+ .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f
+ .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3
+ .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958
+ .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be
+ .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824
+ .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b
+ .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3
+ .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b
+ .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4
+ .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e
+ .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498
+ .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403
+ .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f
+ .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db
+ .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247
+ .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5
+ .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123
+ .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091
+ .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001
+ .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70
+ .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1
+ .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52
+ .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3
+ .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35
+ .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8
+ .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c
+ .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f
+ .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04
+ .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79
+ .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee
+ .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965
+ .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db
+ .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853
+ .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca
+ .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743
+ .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc
+ .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635
+ .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af
+ .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a
+ .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5
+ .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420
+ .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c
+ .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319
+ .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296
+ .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214
+ .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192
+ .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111
+ .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090
+ .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010
DATAEND()
ASM_END()
diff --git a/gmp/mpn/alpha/lshift.asm b/gmp/mpn/alpha/lshift.asm
index c62a856aea..eb5b2a0b68 100644
--- a/gmp/mpn/alpha/lshift.asm
+++ b/gmp/mpn/alpha/lshift.asm
@@ -1,39 +1,28 @@
dnl Alpha mpn_lshift -- Shift a number left.
-dnl Copyright 1994, 1995, 2000, 2003, 2009 Free Software Foundation, Inc.
+dnl Copyright 1994, 1995, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C EV4: ?
-C EV5: 3.25
-C EV6: 1.75
+C EV4: 4.75
+C EV5: 4
+C EV6: 2
C INPUT PARAMETERS
C rp r16
@@ -46,137 +35,63 @@ ASM_START()
PROLOGUE(mpn_lshift)
s8addq r18,r17,r17 C make r17 point at end of s1
ldq r4,-8(r17) C load first limb
- subq r31,r19,r20
+ subq r17,8,r17 C step r17 down past the limb just loaded
+ subq r31,r19,r7 C r7 = 0 - cnt, count for the srl's (low 6 bits act as 64-cnt)
s8addq r18,r16,r16 C make r16 point at end of RES
subq r18,1,r18
- and r18,4-1,r28 C number of limbs in first loop
- srl r4,r20,r0 C compute function result
+ and r18,4-1,r20 C number of limbs in first loop
+ srl r4,r7,r0 C compute function result (bits shifted out of the first limb)
- beq r28,L(L0)
- subq r18,r28,r18
+ beq r20,$L0 C nothing for the first loop to do
+ subq r18,r20,r18 C r18 = limbs left for the 4-way unrolled loop
ALIGN(8)
-L(top0):
- ldq r3,-16(r17)
+$Loop0: ldq r3,-8(r17) C r3 = next lower source limb
subq r16,8,r16
- sll r4,r19,r5
subq r17,8,r17
- subq r28,1,r28
- srl r3,r20,r6
+ subq r20,1,r20 C count down the first loop
+ sll r4,r19,r5 C r5 = current limb shifted left by cnt
+ srl r3,r7,r6 C r6 = bits moving up from the lower limb
bis r3,r3,r4
bis r5,r6,r8
stq r8,0(r16)
- bne r28,L(top0)
+ bne r20,$Loop0 C repeat until the odd limbs are done
-L(L0): sll r4,r19,r24
- beq r18,L(end)
-C warm up phase 1
- ldq r1,-16(r17)
- subq r18,4,r18
- ldq r2,-24(r17)
- ldq r3,-32(r17)
- ldq r4,-40(r17)
-C warm up phase 2
- srl r1,r20,r7
- sll r1,r19,r21
- srl r2,r20,r8
- beq r18,L(end1)
- ldq r1,-48(r17)
- sll r2,r19,r22
- ldq r2,-56(r17)
- srl r3,r20,r5
- bis r7,r24,r7
- sll r3,r19,r23
- bis r8,r21,r8
- srl r4,r20,r6
- ldq r3,-64(r17)
- sll r4,r19,r24
- ldq r4,-72(r17)
- subq r18,4,r18
- beq r18,L(end2)
- ALIGN(16)
-C main loop
-L(top): stq r7,-8(r16)
- bis r5,r22,r5
- stq r8,-16(r16)
- bis r6,r23,r6
-
- srl r1,r20,r7
- subq r18,4,r18
- sll r1,r19,r21
- unop C ldq r31,-96(r17)
-
- srl r2,r20,r8
- ldq r1,-80(r17)
- sll r2,r19,r22
- ldq r2,-88(r17)
-
- stq r5,-24(r16)
- bis r7,r24,r7
- stq r6,-32(r16)
- bis r8,r21,r8
-
- srl r3,r20,r5
- unop C ldq r31,-96(r17)
- sll r3,r19,r23
+$L0: beq r18,$Lend C no full group of four left, only the last limb
+
+ ALIGN(8)
+$Loop: ldq r3,-8(r17) C unrolled loop: four result limbs per pass
subq r16,32,r16
+ subq r18,4,r18 C four fewer limbs remaining
+ sll r4,r19,r5 C upper bits of the result limb
+ srl r3,r7,r6 C lower bits, taken from the limb below
+
+ ldq r4,-16(r17) C r3 and r4 alternate as the newest source limb
+ sll r3,r19,r1
+ bis r5,r6,r8 C merge the two parts
+ stq r8,24(r16) C highest-addressed result limb of this pass
+ srl r4,r7,r2
+
+ ldq r3,-24(r17)
+ sll r4,r19,r5
+ bis r1,r2,r8
+ stq r8,16(r16)
+ srl r3,r7,r6
- srl r4,r20,r6
- ldq r3,-96(r17)
- sll r4,r19,r24
- ldq r4,-104(r17)
+ ldq r4,-32(r17) C this limb is carried into the next pass (or $Lend)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,8(r16)
+ srl r4,r7,r2
subq r17,32,r17
- bne r18,L(top)
-C cool down phase 2/1
-L(end2):
- stq r7,-8(r16)
- bis r5,r22,r5
- stq r8,-16(r16)
- bis r6,r23,r6
- srl r1,r20,r7
- sll r1,r19,r21
- srl r2,r20,r8
- sll r2,r19,r22
- stq r5,-24(r16)
- bis r7,r24,r7
- stq r6,-32(r16)
- bis r8,r21,r8
- srl r3,r20,r5
- sll r3,r19,r23
- srl r4,r20,r6
- sll r4,r19,r24
-C cool down phase 2/2
- stq r7,-40(r16)
- bis r5,r22,r5
- stq r8,-48(r16)
- bis r6,r23,r6
- stq r5,-56(r16)
- stq r6,-64(r16)
-C cool down phase 2/3
- stq r24,-72(r16)
- ret r31,(r26),1
+ bis r1,r2,r8
+ stq r8,0(r16) C lowest-addressed result limb of this pass
-C cool down phase 1/1
-L(end1):
- sll r2,r19,r22
- srl r3,r20,r5
- bis r7,r24,r7
- sll r3,r19,r23
- bis r8,r21,r8
- srl r4,r20,r6
- sll r4,r19,r24
-C cool down phase 1/2
- stq r7,-8(r16)
- bis r5,r22,r5
- stq r8,-16(r16)
- bis r6,r23,r6
- stq r5,-24(r16)
- stq r6,-32(r16)
- stq r24,-40(r16)
- ret r31,(r26),1
+ bgt r18,$Loop C loop while limbs remain
-L(end): stq r24,-8(r16)
+$Lend: sll r4,r19,r8 C last limb: no lower limb to take bits from
+ stq r8,-8(r16) C store it just below the previous result limb
ret r31,(r26),1
EPILOGUE(mpn_lshift)
ASM_END()
diff --git a/gmp/mpn/alpha/mod_34lsub1.asm b/gmp/mpn/alpha/mod_34lsub1.asm
index 1b03b637d8..e5c1d221f9 100644
--- a/gmp/mpn/alpha/mod_34lsub1.asm
+++ b/gmp/mpn/alpha/mod_34lsub1.asm
@@ -3,30 +3,19 @@ dnl Alpha mpn_mod_34lsub1.
dnl Copyright 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/mode1o.asm b/gmp/mpn/alpha/mode1o.asm
index 96dccc73ee..0611cd8acb 100644
--- a/gmp/mpn/alpha/mode1o.asm
+++ b/gmp/mpn/alpha/mode1o.asm
@@ -1,32 +1,21 @@
dnl Alpha mpn_modexact_1c_odd -- mpn exact remainder
dnl Copyright 2003, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/mul_1.asm b/gmp/mpn/alpha/mul_1.asm
index a7cdbcf8eb..30b17021ba 100644
--- a/gmp/mpn/alpha/mul_1.asm
+++ b/gmp/mpn/alpha/mul_1.asm
@@ -4,30 +4,19 @@ dnl the result in a second limb vector.
dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/rshift.asm b/gmp/mpn/alpha/rshift.asm
index 6e1e214558..ccedff8071 100644
--- a/gmp/mpn/alpha/rshift.asm
+++ b/gmp/mpn/alpha/rshift.asm
@@ -1,39 +1,28 @@
dnl Alpha mpn_rshift -- Shift a number right.
-dnl Copyright 1994, 1995, 2000, 2009 Free Software Foundation, Inc.
+dnl Copyright 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C EV4: ?
-C EV5: 3.25
-C EV6: 1.75
+C EV4: 4.75
+C EV5: 3.75
+C EV6: 2
C INPUT PARAMETERS
C rp r16
@@ -45,136 +34,62 @@ C cnt r19
ASM_START()
PROLOGUE(mpn_rshift)
ldq r4,0(r17) C load first limb
- subq r31,r19,r20
+ addq r17,8,r17 C step r17 up past the limb just loaded
+ subq r31,r19,r7 C r7 = 0 - cnt, count for the sll's (low 6 bits act as 64-cnt)
subq r18,1,r18
- and r18,4-1,r28 C number of limbs in first loop
- sll r4,r20,r0 C compute function result
+ and r18,4-1,r20 C number of limbs in first loop
+ sll r4,r7,r0 C compute function result (bits shifted out of the first limb)
- beq r28,L(L0)
- subq r18,r28,r18
+ beq r20,$L0 C nothing for the first loop to do
+ subq r18,r20,r18 C r18 = limbs left for the 4-way unrolled loop
ALIGN(8)
-L(top0):
- ldq r3,8(r17)
+$Loop0: ldq r3,0(r17) C r3 = next higher source limb
addq r16,8,r16
- srl r4,r19,r5
addq r17,8,r17
- subq r28,1,r28
- sll r3,r20,r6
+ subq r20,1,r20 C count down the first loop
+ srl r4,r19,r5 C r5 = current limb shifted right by cnt
+ sll r3,r7,r6 C r6 = bits moving down from the higher limb
bis r3,r3,r4
bis r5,r6,r8
stq r8,-8(r16)
- bne r28,L(top0)
+ bne r20,$Loop0 C repeat until the odd limbs are done
-L(L0): srl r4,r19,r24
- beq r18,L(end)
-C warm up phase 1
- ldq r1,8(r17)
- subq r18,4,r18
- ldq r2,16(r17)
- ldq r3,24(r17)
- ldq r4,32(r17)
-C warm up phase 2
- sll r1,r20,r7
- srl r1,r19,r21
- sll r2,r20,r8
- beq r18,L(end1)
- ldq r1,40(r17)
- srl r2,r19,r22
- ldq r2,48(r17)
- sll r3,r20,r5
- bis r7,r24,r7
- srl r3,r19,r23
- bis r8,r21,r8
- sll r4,r20,r6
- ldq r3,56(r17)
- srl r4,r19,r24
- ldq r4,64(r17)
- subq r18,4,r18
- beq r18,L(end2)
- ALIGN(16)
-C main loop
-L(top): stq r7,0(r16)
- bis r5,r22,r5
- stq r8,8(r16)
- bis r6,r23,r6
-
- sll r1,r20,r7
- subq r18,4,r18
- srl r1,r19,r21
- unop C ldq r31,-96(r17)
-
- sll r2,r20,r8
- ldq r1,72(r17)
- srl r2,r19,r22
- ldq r2,80(r17)
-
- stq r5,16(r16)
- bis r7,r24,r7
- stq r6,24(r16)
- bis r8,r21,r8
-
- sll r3,r20,r5
- unop C ldq r31,-96(r17)
- srl r3,r19,r23
+$L0: beq r18,$Lend C no full group of four left, only the last limb
+
+ ALIGN(8)
+$Loop: ldq r3,0(r17) C unrolled loop: four result limbs per pass
addq r16,32,r16
+ subq r18,4,r18 C four fewer limbs remaining
+ srl r4,r19,r5 C lower bits of the result limb
+ sll r3,r7,r6 C upper bits, taken from the limb above
+
+ ldq r4,8(r17) C r3 and r4 alternate as the newest source limb
+ srl r3,r19,r1
+ bis r5,r6,r8 C merge the two parts
+ stq r8,-32(r16) C lowest-addressed result limb of this pass
+ sll r4,r7,r2
+
+ ldq r3,16(r17)
+ srl r4,r19,r5
+ bis r1,r2,r8
+ stq r8,-24(r16)
+ sll r3,r7,r6
- sll r4,r20,r6
- ldq r3,88(r17)
- srl r4,r19,r24
- ldq r4,96(r17)
+ ldq r4,24(r17) C this limb is carried into the next pass (or $Lend)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-16(r16)
+ sll r4,r7,r2
addq r17,32,r17
- bne r18,L(top)
-C cool down phase 2/1
-L(end2):
- stq r7,0(r16)
- bis r5,r22,r5
- stq r8,8(r16)
- bis r6,r23,r6
- sll r1,r20,r7
- srl r1,r19,r21
- sll r2,r20,r8
- srl r2,r19,r22
- stq r5,16(r16)
- bis r7,r24,r7
- stq r6,24(r16)
- bis r8,r21,r8
- sll r3,r20,r5
- srl r3,r19,r23
- sll r4,r20,r6
- srl r4,r19,r24
-C cool down phase 2/2
- stq r7,32(r16)
- bis r5,r22,r5
- stq r8,40(r16)
- bis r6,r23,r6
- stq r5,48(r16)
- stq r6,56(r16)
-C cool down phase 2/3
- stq r24,64(r16)
- ret r31,(r26),1
+ bis r1,r2,r8
+ stq r8,-8(r16) C highest-addressed result limb of this pass
-C cool down phase 1/1
-L(end1):
- srl r2,r19,r22
- sll r3,r20,r5
- bis r7,r24,r7
- srl r3,r19,r23
- bis r8,r21,r8
- sll r4,r20,r6
- srl r4,r19,r24
-C cool down phase 1/2
- stq r7,0(r16)
- bis r5,r22,r5
- stq r8,8(r16)
- bis r6,r23,r6
- stq r5,16(r16)
- stq r6,24(r16)
- stq r24,32(r16)
- ret r31,(r26),1
+ bgt r18,$Loop C loop while limbs remain
-L(end): stq r24,0(r16)
+$Lend: srl r4,r19,r8 C last limb: no higher limb to take bits from
+ stq r8,0(r16) C store it just above the previous result limb
ret r31,(r26),1
EPILOGUE(mpn_rshift)
ASM_END()
diff --git a/gmp/mpn/alpha/sec_tabselect.asm b/gmp/mpn/alpha/sec_tabselect.asm
deleted file mode 100644
index 679b16926e..0000000000
--- a/gmp/mpn/alpha/sec_tabselect.asm
+++ /dev/null
@@ -1,137 +0,0 @@
-dnl Alpha mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C EV4: ?
-C EV5: 2.25
-C EV6: 1.64
-
-define(`rp', `r16')
-define(`tp', `r17')
-define(`n', `r18')
-define(`nents', `r19')
-define(`which', `r20')
-
-define(`i', `r21')
-define(`j', `r22')
-define(`stride', `r23')
-define(`mask', `r24')
-define(`k', `r25')
-
-
-ASM_START()
-PROLOGUE(mpn_sec_tabselect)
- subq n, 4, j C outer loop induction variable
-
- blt j, L(outer_end)
-L(outer_top):
- mov tp, r8
- lda r0, 0(r31)
- lda r1, 0(r31)
- lda r2, 0(r31)
- lda r3, 0(r31)
- subq j, 4, j C outer loop induction variable
- subq nents, which, k
- mov nents, i
-
- ALIGN(16)
-L(top): ldq r4, 0(tp)
- ldq r5, 8(tp)
- cmpeq k, i, mask
- subq i, 1, i
- subq r31, mask, mask
- ldq r6, 16(tp)
- ldq r7, 24(tp)
- and r4, mask, r4
- and r5, mask, r5
- or r0, r4, r0
- or r1, r5, r1
- and r6, mask, r6
- and r7, mask, r7
- or r2, r6, r2
- or r3, r7, r3
- s8addq n, tp, tp
- bne i, L(top)
-
- stq r0, 0(rp)
- stq r1, 8(rp)
- stq r2, 16(rp)
- stq r3, 24(rp)
- addq r8, 32, tp
- addq rp, 32, rp
- bge j, L(outer_top)
-L(outer_end):
-
- and n, 2, r0
- beq r0, L(b0x)
-L(b1x): mov tp, r8
- lda r0, 0(r31)
- lda r1, 0(r31)
- subq nents, which, k
- mov nents, i
- ALIGN(16)
-L(tp2): ldq r4, 0(tp)
- ldq r5, 8(tp)
- cmpeq k, i, mask
- subq i, 1, i
- subq r31, mask, mask
- and r4, mask, r4
- and r5, mask, r5
- or r0, r4, r0
- or r1, r5, r1
- s8addq n, tp, tp
- bne i, L(tp2)
- stq r0, 0(rp)
- stq r1, 8(rp)
- addq r8, 16, tp
- addq rp, 16, rp
-
-L(b0x): and n, 1, r0
- beq r0, L(b00)
-L(b01): lda r0, 0(r31)
- subq nents, which, k
- mov nents, i
- ALIGN(16)
-L(tp1): ldq r4, 0(tp)
- cmpeq k, i, mask
- subq i, 1, i
- subq r31, mask, mask
- and r4, mask, r4
- or r0, r4, r0
- s8addq n, tp, tp
- bne i, L(tp1)
- stq r0, 0(rp)
-
-L(b00): ret r31, (r26), 1
-EPILOGUE()
diff --git a/gmp/mpn/alpha/sqr_diag_addlsh1.asm b/gmp/mpn/alpha/sqr_diag_addlsh1.asm
deleted file mode 100644
index ee219ef7e8..0000000000
--- a/gmp/mpn/alpha/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,93 +0,0 @@
-dnl Alpha mpn_sqr_diag_addlsh1.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C EV4: ?
-C EV5: 10.2
-C EV6: 4.5
-
-C Ideally, one-way code could run at 9 c/l (limited by mulq+umulh) on ev5 and
-C about 3.75 c/l on ev6. Two-way code could run at about 3.25 c/l on ev6.
-
-C Algorithm: We allow ourselves to propagate carry to a product high word
-C without worrying for carry out, since (B-1)^2 = B^2-2B+1 has a high word of
-C B-2, i.e, will not spill. We propagate carry similarly to a product low word
-C since the problem value B-1 is a quadratic non-residue mod B, but our
-C products are squares.
-
-define(`rp', `r16')
-define(`tp', `r17')
-define(`up', `r18')
-define(`n', `r19')
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
- ldq r0, 0(up)
- bis r31, r31, r21
- bis r31, r31, r3
- mulq r0, r0, r7
- stq r7, 0(rp)
- umulh r0, r0, r6
- lda n, -1(n)
-
- ALIGN(16)
-L(top): ldq r0, 8(up)
- lda up, 8(up)
- ldq r8, 0(tp)
- ldq r20, 8(tp)
- mulq r0, r0, r7
- lda tp, 16(tp)
- sll r8, 1, r23
- srl r8, 63, r22
- or r21, r23, r23
- sll r20, 1, r24
- addq r3, r6, r6 C cannot carry per comment above
- or r22, r24, r24
- addq r23, r6, r21
- umulh r0, r0, r6
- cmpult r21, r23, r1
- addq r1, r7, r7 C cannot carry per comment above
- stq r21, 8(rp)
- addq r24, r7, r22
- stq r22, 16(rp)
- lda n, -1(n)
- cmpult r22, r7, r3
- srl r20, 63, r21
- lda rp, 16(rp)
- bne n, L(top)
-
- addq r3, r6, r6 C cannot carry per comment above
- addq r21, r6, r21
- stq r21, 8(rp)
- ret r31, (r26), 1
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/alpha/sqr_diagonal.asm b/gmp/mpn/alpha/sqr_diagonal.asm
new file mode 100644
index 0000000000..2aa7f2e597
--- /dev/null
+++ b/gmp/mpn/alpha/sqr_diagonal.asm
@@ -0,0 +1,65 @@
+dnl Alpha mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.45
+
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C n r18
+
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ ldq r2,0(r17) C r2 = first source limb, up[0]
+ lda r18,-2(r18) C size -= 2
+ mulq r2,r2,r3 C r3 = prod_low of limb^2
+ umulh r2,r2,r4 C r4 = prod_high of limb^2
+ blt r18,$Lend1 C jump if size was == 1 (size-2 is negative)
+ ldq r2,8(r17) C r2 = second source limb, up[1]
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: stq r3,0(r16) C store prod_low of the previous limb
+ mulq r2,r2,r3 C r3 = prod_low of the current limb
+ lda r18,-1(r18) C size--
+ stq r4,8(r16) C store prod_high of the previous limb
+ umulh r2,r2,r4 C r4 = prod_high of the current limb
+ ldq r2,16(r17) C r2 = next source limb, for the following pass
+ lda r17,8(r17) C s1_ptr++ (one limb)
+ lda r16,16(r16) C res_ptr += 2 limbs (each square yields two)
+ bne r18,$Loop
+
+$Lend2: stq r3,0(r16) C store the pending square (low, then high below)
+ mulq r2,r2,r3 C r3 = prod_low of the last limb
+ stq r4,8(r16)
+ umulh r2,r2,r4 C r4 = prod_high of the last limb
+ stq r3,16(r16) C store the last limb's square
+ stq r4,24(r16)
+ ret r31,(r26),1
+$Lend1: stq r3,0(r16) C size 1: store the single square, low then high
+ stq r4,8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_sqr_diagonal)
+ASM_END()
diff --git a/gmp/mpn/alpha/sub_n.asm b/gmp/mpn/alpha/sub_n.asm
index 1bb72263f8..842a4f0b54 100644
--- a/gmp/mpn/alpha/sub_n.asm
+++ b/gmp/mpn/alpha/sub_n.asm
@@ -1,164 +1,117 @@
-dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0
-dnl and store difference in a third limb vector.
+dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
-dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2002, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C EV4: ?
-C EV5: 4.75
-C EV6: 3
+C EV4: 7.75
+C EV5: 5.75
+C EV6: 4
-dnl INPUT PARAMETERS
-dnl res_ptr r16
-dnl s1_ptr r17
-dnl s2_ptr r18
-dnl size r19
+C INPUT PARAMETERS
+C rp r16
+C up r17
+C vp r18
+C n r19
ASM_START()
-PROLOGUE(mpn_sub_nc)
- bis r31,r20,r25
- br L(com)
-EPILOGUE()
PROLOGUE(mpn_sub_n)
- bis r31,r31,r25 C clear cy
-L(com): subq r19,4,r19 C decr loop cnt
- blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
-C Start software pipeline for 1st loop
- ldq r0,0(r18)
- ldq r4,0(r17)
- ldq r1,8(r18)
- ldq r5,8(r17)
- addq r17,32,r17 C update s1_ptr
- subq r4,r0,r28 C 1st main subtract
- ldq r2,16(r18)
- subq r28,r25,r20 C 1st carry subtract
- ldq r3,24(r18)
- cmpult r4,r0,r8 C compute cy from last subtract
- ldq r6,-16(r17)
- cmpult r28,r25,r25 C compute cy from last subtract
- ldq r7,-8(r17)
- bis r8,r25,r25 C combine cy from the two subtracts
- subq r19,4,r19 C decr loop cnt
- subq r5,r1,r28 C 2nd main subtract
- addq r18,32,r18 C update s2_ptr
- subq r28,r25,r21 C 2nd carry subtract
- cmpult r5,r1,r8 C compute cy from last subtract
- blt r19,$Lend1 C if less than 4 limbs remain, jump
-C 1st loop handles groups of 4 limbs in a software pipeline
- ALIGN(16)
-$Loop: cmpult r28,r25,r25 C compute cy from last subtract
- ldq r0,0(r18)
- bis r8,r25,r25 C combine cy from the two subtracts
- ldq r1,8(r18)
- subq r6,r2,r28 C 3rd main subtract
- ldq r4,0(r17)
- subq r28,r25,r22 C 3rd carry subtract
- ldq r5,8(r17)
- cmpult r6,r2,r8 C compute cy from last subtract
- cmpult r28,r25,r25 C compute cy from last subtract
- stq r20,0(r16)
- bis r8,r25,r25 C combine cy from the two subtracts
- stq r21,8(r16)
- subq r7,r3,r28 C 4th main subtract
- subq r28,r25,r23 C 4th carry subtract
- cmpult r7,r3,r8 C compute cy from last subtract
- cmpult r28,r25,r25 C compute cy from last subtract
- addq r17,32,r17 C update s1_ptr
- bis r8,r25,r25 C combine cy from the two subtracts
- addq r16,32,r16 C update res_ptr
- subq r4,r0,r28 C 1st main subtract
- ldq r2,16(r18)
- subq r28,r25,r20 C 1st carry subtract
- ldq r3,24(r18)
- cmpult r4,r0,r8 C compute cy from last subtract
- ldq r6,-16(r17)
- cmpult r28,r25,r25 C compute cy from last subtract
- ldq r7,-8(r17)
- bis r8,r25,r25 C combine cy from the two subtracts
- subq r19,4,r19 C decr loop cnt
- stq r22,-16(r16)
- subq r5,r1,r28 C 2nd main subtract
- stq r23,-8(r16)
- subq r28,r25,r21 C 2nd carry subtract
- addq r18,32,r18 C update s2_ptr
- cmpult r5,r1,r8 C compute cy from last subtract
- bge r19,$Loop
-C Finish software pipeline for 1st loop
-$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
- bis r8,r25,r25 C combine cy from the two subtracts
- subq r6,r2,r28 C cy add
- subq r28,r25,r22 C 3rd main subtract
- cmpult r6,r2,r8 C compute cy from last subtract
- cmpult r28,r25,r25 C compute cy from last subtract
- stq r20,0(r16)
- bis r8,r25,r25 C combine cy from the two subtracts
- stq r21,8(r16)
- subq r7,r3,r28 C cy add
- subq r28,r25,r23 C 4th main subtract
- cmpult r7,r3,r8 C compute cy from last subtract
- cmpult r28,r25,r25 C compute cy from last subtract
- bis r8,r25,r25 C combine cy from the two subtracts
- addq r16,32,r16 C update res_ptr
- stq r22,-16(r16)
- stq r23,-8(r16)
-$Lend2: addq r19,4,r19 C restore loop cnt
- beq r19,$Lret
-C Start software pipeline for 2nd loop
- ldq r0,0(r18)
- ldq r4,0(r17)
+ ldq r3,0(r17) C r3 = first up limb (minuend)
+ ldq r4,0(r18) C r4 = first vp limb (subtrahend)
+
subq r19,1,r19
- beq r19,$Lend0
-C 2nd loop handles remaining 1-3 limbs
- ALIGN(16)
-$Loop0: subq r4,r0,r28 C main subtract
- cmpult r4,r0,r8 C compute cy from last subtract
- ldq r0,8(r18)
- ldq r4,8(r17)
- subq r28,r25,r20 C carry subtract
- addq r18,8,r18
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0 C r0 = 0, no borrow going in
+ beq r2,$L0 C skip first loop when n-1 is a multiple of 4
+
+ subq r19,r2,r19 C r19 = limbs left for the 4-way unrolled loop
+
+$Loop0: subq r2,1,r2 C count down the first loop
+ ldq r5,8(r17) C preload next up limb
+ addq r4,r0,r4 C vp limb + incoming borrow
+ ldq r6,8(r18) C preload next vp limb
+ cmpult r4,r0,r1 C r1 = 1 if that addition wrapped around
+ subq r3,r4,r4 C difference limb
+ cmpult r3,r4,r0 C r0 = 1 if the subtraction borrowed
+ stq r4,0(r16) C store difference limb
+ bis r0,r1,r0 C outgoing borrow = either of the two
+
addq r17,8,r17
- stq r20,0(r16)
- cmpult r28,r25,r25 C compute cy from last subtract
- subq r19,1,r19 C decr loop cnt
- bis r8,r25,r25 C combine cy from the two subtracts
+ addq r18,8,r18 C advance vp
+ bis r5,r5,r3 C move preloaded limbs into r3/r4 for the next pass
+ bis r6,r6,r4
addq r16,8,r16
- bne r19,$Loop0
-$Lend0: subq r4,r0,r28 C main subtract
- subq r28,r25,r20 C carry subtract
- cmpult r4,r0,r8 C compute cy from last subtract
- cmpult r28,r25,r25 C compute cy from last subtract
- stq r20,0(r16)
- bis r8,r25,r25 C combine cy from the two subtracts
-
-$Lret: bis r25,r31,r0 C return cy
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend C no full group of four left, only the last limb
+
+ ALIGN(8)
+$Loop: subq r19,4,r19 C unrolled loop: four limbs per pass
+
+ ldq r5,8(r17) C limb pairs alternate between r3/r4 and r5/r6
+ addq r4,r0,r4 C vp limb + incoming borrow
+ ldq r6,8(r18)
+ cmpult r4,r0,r1 C r1 = 1 if that addition wrapped around
+ subq r3,r4,r4 C difference limb
+ cmpult r3,r4,r0 C r0 = 1 if the subtraction borrowed
+ stq r4,0(r16)
+ bis r0,r1,r0 C outgoing borrow = either of the two
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17) C these limbs are used by the next pass (or $Lend)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17 C advance all three pointers by four limbs
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4 C last limb pair, already loaded in r3/r4
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0 C r0 = final borrow, left in r0 at return
ret r31,(r26),1
-EPILOGUE()
+EPILOGUE(mpn_sub_n)
ASM_END()
diff --git a/gmp/mpn/alpha/submul_1.asm b/gmp/mpn/alpha/submul_1.asm
index 2b63b52fa4..554ccf51b6 100644
--- a/gmp/mpn/alpha/submul_1.asm
+++ b/gmp/mpn/alpha/submul_1.asm
@@ -4,30 +4,19 @@ dnl the result from a second limb vector.
dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/umul.asm b/gmp/mpn/alpha/umul.asm
index 039081ed48..7fa3f008f1 100644
--- a/gmp/mpn/alpha/umul.asm
+++ b/gmp/mpn/alpha/umul.asm
@@ -3,30 +3,19 @@ dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/alpha/unicos.m4 b/gmp/mpn/alpha/unicos.m4
index e05cf5cca6..f1f41c18e4 100644
--- a/gmp/mpn/alpha/unicos.m4
+++ b/gmp/mpn/alpha/unicos.m4
@@ -3,33 +3,22 @@ divert(-1)
dnl m4 macros for alpha assembler on unicos.
-dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2000, 2002, 2003, 2004 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Note that none of the standard GMP_ASM_ autoconf tests are done for
@@ -86,9 +75,8 @@ m4_assert_numargs(1)
` .extern $1')
define(`DATASTART',
-m4_assert_numargs_range(1,2)
+m4_assert_numargs(1)
` .psect $1@crud,data
- ALIGN(ifelse($#,1,2,$2))
$1:')
define(`DATAEND',
diff --git a/gmp/mpn/arm/README b/gmp/mpn/arm/README
deleted file mode 100644
index 598baa3f2e..0000000000
--- a/gmp/mpn/arm/README
+++ /dev/null
@@ -1,35 +0,0 @@
-Copyright 2002, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
-
-
-
-
-
-This directory contains mpn functions for ARM processors. It has been
-optimised for Cortex-A9, but the code in the top-level directory should run
-on all ARM processors at architecture level v4 or later.
diff --git a/gmp/mpn/arm/add_n.asm b/gmp/mpn/arm/add_n.asm
new file mode 100644
index 0000000000..0f0791759d
--- /dev/null
+++ b/gmp/mpn/arm/add_n.asm
@@ -0,0 +1,69 @@
+dnl ARM mpn_add_n -- Add two limb vectors of the same length > 0 and store sum
+dnl in a third limb vector.
+dnl Contributed by Robert Harley.
+
+dnl Copyright 1997, 2000, 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This code runs at 5 cycles/limb.
+
+define(`rp',`r0')  C result pointer; the sums are stored through it
+define(`up',`r1')  C first source limb pointer
+define(`vp',`r2')  C second source limb pointer
+define(`n',`r3')  C limb count on entry; halved below into a count of limb pairs
+
+
+ASM_START()
+PROLOGUE(mpn_add_n)  C rp[] = up[] + vp[] over n limbs; the carry out is returned in r0
+ stmfd sp!, { r8, r9, lr }  C save the registers used as scratch; the saved lr is popped into pc at exit
+ movs n, n, lsr #1  C n = n/2 (limb pairs); carry flag = old bit 0 of n
+ bcc L(skip1)  C even count: no single limb to do, and carry is already clear
+ ldr r12, [up], #4  C odd count: handle one limb first
+ ldr lr, [vp], #4
+ adds r12, r12, lr  C first addition has no carry-in; it seeds the adcs chain
+ str r12, [rp], #4
+L(skip1):
+ tst n, #1  C odd number of pairs? (tst with a plain immediate is expected to leave carry alone)
+ beq L(skip2)
+ ldmia up!, { r8, r9 }  C add one pair of limbs, propagating carry
+ ldmia vp!, { r12, lr }
+ adcs r8, r8, r12
+ adcs r9, r9, lr
+ stmia rp!, { r8, r9 }
+L(skip2):
+ bics n, n, #1  C n = remaining pairs, now even; Z is set when nothing is left
+ beq L(return)
+ stmfd sp!, { r4, r5, r6, r7 }  C the 4-limb loop needs four more registers
+L(add_n_loop):  C each pass adds 4 limbs, i.e. 2 pairs
+ ldmia up!, { r4, r5, r6, r7 }
+ ldmia vp!, { r8, r9, r12, lr }
+ adcs r4, r4, r8
+ ldr r8, [rp, #12] C cache allocate: touch the destination; the loaded value is not used
+ adcs r5, r5, r9
+ adcs r6, r6, r12
+ adcs r7, r7, lr
+ stmia rp!, { r4, r5, r6, r7 }
+ sub n, n, #2  C two pairs done; the adcs chain needs carry to survive this sub and the teq
+ teq n, #0
+ bne L(add_n_loop)
+ ldmfd sp!, { r4, r5, r6, r7 }
+L(return):
+ adc r0, n, #0  C n is 0 on every path here, so r0 = carry out of the last addition
+ ldmfd sp!, { r8, r9, pc }  C restore r8, r9 and return by popping the saved lr into pc
+EPILOGUE(mpn_add_n)
diff --git a/gmp/mpn/arm/addmul_1.asm b/gmp/mpn/arm/addmul_1.asm
new file mode 100644
index 0000000000..de33f2f34b
--- /dev/null
+++ b/gmp/mpn/arm/addmul_1.asm
@@ -0,0 +1,107 @@
+dnl ARM mpn_addmul_1 -- Multiply a limb vector with a limb and add the result
+dnl to a second limb vector.
+
+dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: 7.75-9.75 (dependent on vl value)
+C XScale: 8-9 (dependent on vl value, estimated)
+
+define(`rp',`r0')  C pointer to the limbs that are updated in place
+define(`up',`r1')  C pointer to the source limbs that get multiplied
+define(`n',`r2')  C limb count
+define(`vl',`r3')  C the single multiplier limb
+define(`rl',`r12')  C limb loaded from rp
+define(`ul',`r6')  C limb loaded from up
+define(`r',`lr')  C scratch: the sum about to be stored, also the masked loop count
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)  C rp[] += up[] * vl over n limbs; the carry-out limb is returned in r0
+ stmfd sp!, { r4-r6, lr }  C save scratch registers; the saved lr is popped into pc at exit
+ mov r4, #0 C clear r4, which carries the high product limb from one step to the next
+ adds r0, r0, #0 C clear cy: adding 0 cannot carry, and r0 keeps its value
+ tst n, #1  C bit 0 of n set: do one limb
+ beq L(skip1)
+ ldr ul, [up], #4
+ ldr rl, [rp, #0]
+ umull r5, r4, ul, vl  C r4:r5 = ul * vl, with r5 the low and r4 the high limb
+ adds r, rl, r5  C no carry-in yet; this starts the carry chain
+ str r, [rp], #4
+L(skip1):
+ tst n, #2  C bit 1 of n set: do two limbs
+ beq L(skip2)
+ ldr ul, [up], #4
+ ldr rl, [rp, #0]
+ mov r5, #0
+ umlal r4, r5, ul, vl  C r5:r4 = r4 + ul * vl; r4 is now the low limb and r5 the high limb
+ ldr ul, [up], #4
+ adcs r, rl, r4
+ ldr rl, [rp, #4]  C fetch the next rp limb before the store below advances rp
+ mov r4, #0
+ umlal r5, r4, ul, vl  C roles swap back: r4:r5 = r5 + ul * vl
+ str r, [rp], #4
+ adcs r, rl, r5
+ str r, [rp], #4
+L(skip2):
+ bics r, n, #3  C r = n with its low 2 bits cleared; n itself is not modified
+ beq L(return)  C no group of 4 limbs to process
+
+ ldr ul, [up], #4  C prime the pipeline with the first product of the group
+ ldr rl, [rp, #0]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ b L(in)
+
+L(loop):  C loop top: finish the limb left pending by the previous pass
+ ldr ul, [up], #4
+ adcs r, rl, r5
+ ldr rl, [rp, #4]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ str r, [rp], #4
+L(in): ldr ul, [up], #4  C first-pass entry point; three more add and multiply steps follow
+ adcs r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl
+ str r, [rp], #4
+ ldr ul, [up], #4
+ adcs r, rl, r5
+ ldr rl, [rp, #4]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ str r, [rp], #4
+ ldr ul, [up], #4
+ adcs r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl  C leaves one limb pending: low part in r5, high part in r4
+ str r, [rp], #4
+ sub n, n, #4  C one group of 4 consumed
+ bics r, n, #3  C any full group left? the carry chain needs carry to survive this sub and bics
+ bne L(loop)
+
+ adcs r, rl, r5  C flush the last pending limb
+ str r, [rp], #4
+L(return):
+ adc r0, r4, #0  C return value = high limb in r4 plus the final carry
+ ldmfd sp!, { r4-r6, pc }  C restore r4-r6 and return by popping the saved lr into pc
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp/mpn/arm/aors_n.asm b/gmp/mpn/arm/aors_n.asm
deleted file mode 100644
index fdad9f7ba6..0000000000
--- a/gmp/mpn/arm/aors_n.asm
+++ /dev/null
@@ -1,112 +0,0 @@
-dnl ARM mpn_add_n and mpn_sub_n
-
-dnl Contributed to the GNU project by Robert Harley.
-
-dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 2.5 slightly fluctuating
-C Cortex-A15 2.25
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-ifdef(`OPERATION_add_n', `
- define(`ADDSUB', adds)
- define(`ADDSUBC', adcs)
- define(`CLRCY', `cmn r0, #0')
- define(`SETCY', `cmp $1, #1')
- define(`RETVAL', `adc r0, n, #0')
- define(`func', mpn_add_n)
- define(`func_nc', mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(`ADDSUB', subs)
- define(`ADDSUBC', sbcs)
- define(`CLRCY', `cmp r0, r0')
- define(`SETCY', `rsbs $1, $1, #0')
- define(`RETVAL', `sbc r0, r0, r0
- and r0, r0, #1')
- define(`func', mpn_sub_n)
- define(`func_nc', mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-ASM_START()
-PROLOGUE(func_nc)
- ldr r12, [sp, #0]
- stmfd sp!, { r8, r9, lr }
- SETCY( r12)
- b L(ent)
-EPILOGUE()
-PROLOGUE(func)
- stmfd sp!, { r8, r9, lr }
- CLRCY( r12)
-L(ent): tst n, #1
- beq L(skip1)
- ldr r12, [up], #4
- ldr lr, [vp], #4
- ADDSUBC r12, r12, lr
- str r12, [rp], #4
-L(skip1):
- tst n, #2
- beq L(skip2)
- ldmia up!, { r8, r9 }
- ldmia vp!, { r12, lr }
- ADDSUBC r8, r8, r12
- ADDSUBC r9, r9, lr
- stmia rp!, { r8, r9 }
-L(skip2):
- bics n, n, #3
- beq L(rtn)
- stmfd sp!, { r4, r5, r6, r7 }
-
-L(top): ldmia up!, { r4, r5, r6, r7 }
- ldmia vp!, { r8, r9, r12, lr }
- ADDSUBC r4, r4, r8
- sub n, n, #4
- ADDSUBC r5, r5, r9
- ADDSUBC r6, r6, r12
- ADDSUBC r7, r7, lr
- stmia rp!, { r4, r5, r6, r7 }
- teq n, #0
- bne L(top)
-
- ldmfd sp!, { r4, r5, r6, r7 }
-
-L(rtn): RETVAL
- ldmfd sp!, { r8, r9, pc }
-EPILOGUE()
diff --git a/gmp/mpn/arm/aorslsh1_n.asm b/gmp/mpn/arm/aorslsh1_n.asm
deleted file mode 100644
index 1cbd4ba1af..0000000000
--- a/gmp/mpn/arm/aorslsh1_n.asm
+++ /dev/null
@@ -1,167 +0,0 @@
-dnl ARM mpn_addlsh1_n and mpn_sublsh1_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C addlsh1_n sublsh1_n
-C cycles/limb cycles/limb
-C StrongARM ? ?
-C XScale ? ?
-C Cortex-A7 ? ?
-C Cortex-A8 ? ?
-C Cortex-A9 3.12 3.7
-C Cortex-A15 ? ?
-
-C TODO
-C * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1.
-C The sublsh1_n code could surely be tweaked, its REVCY slows down things
-C very much. If two insns are really needed, it might help to separate them
-C for better micro-parallelism.
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-ifdef(`OPERATION_addlsh1_n', `
- define(`ADDSUB', adds)
- define(`ADDSUBC', adcs)
- define(`SETCY', `cmp $1, #1')
- define(`RETVAL', `adc r0, $1, #2')
- define(`SAVECY', `sbc $1, $2, #0')
- define(`RESTCY', `cmn $1, #1')
- define(`REVCY', `')
- define(`INICYR', `mov $1, #0')
- define(`r10r11', `r11')
- define(`func', mpn_addlsh1_n)
- define(`func_nc', mpn_addlsh1_nc)')
-ifdef(`OPERATION_sublsh1_n', `
- define(`ADDSUB', subs)
- define(`ADDSUBC', sbcs)
- define(`SETCY', `rsbs $1, $1, #0')
- define(`RETVAL', `adc r0, $1, #1')
- define(`SAVECY', `sbc $1, $1, $1')
- define(`RESTCY', `cmn $1, #1')
- define(`REVCY', `sbc $1, $1, $1
- cmn $1, #1')
- define(`INICYR', `mvn $1, #0')
- define(`r10r11', `r10')
- define(`func', mpn_sublsh1_n)
- define(`func_nc', mpn_sublsh1_nc)')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
-
-ASM_START()
-PROLOGUE(func)
- push {r4-r10r11, r14}
-
-ifdef(`OPERATION_addlsh1_n', `
- mvn r11, #0
-')
- INICYR( r14)
- subs n, n, #3
- blt L(le2) C carry clear on branch path
-
- cmn r0, #0 C clear carry
- ldmia vp!, {r8, r9, r10}
- b L(mid)
-
-L(top): RESTCY( r14)
- ADDSUBC r4, r4, r8
- ADDSUBC r5, r5, r9
- ADDSUBC r6, r6, r10
- ldmia vp!, {r8, r9, r10}
- stmia rp!, {r4, r5, r6}
- REVCY(r14)
- adcs r8, r8, r8
- adcs r9, r9, r9
- adcs r10, r10, r10
- ldmia up!, {r4, r5, r6}
- SAVECY( r14, r11)
- subs n, n, #3
- blt L(exi)
- RESTCY( r12)
- ADDSUBC r4, r4, r8
- ADDSUBC r5, r5, r9
- ADDSUBC r6, r6, r10
- ldmia vp!, {r8, r9, r10}
- stmia rp!, {r4, r5, r6}
- REVCY(r12)
-L(mid): adcs r8, r8, r8
- adcs r9, r9, r9
- adcs r10, r10, r10
- ldmia up!, {r4, r5, r6}
- SAVECY( r12, r11)
- subs n, n, #3
- bge L(top)
-
- mov r7, r12 C swap alternating...
- mov r12, r14 C ...carry-save...
- mov r14, r7 C ...registers
-
-L(exi): RESTCY( r12)
- ADDSUBC r4, r4, r8
- ADDSUBC r5, r5, r9
- ADDSUBC r6, r6, r10
- stmia rp!, {r4, r5, r6}
-
- REVCY(r12)
-L(le2): tst n, #1 C n = {-1,-2,-3} map to [2], [1], [0]
- beq L(e1)
-
-L(e02): tst n, #2
- beq L(rt0)
- ldm vp, {r8, r9}
- adcs r8, r8, r8
- adcs r9, r9, r9
- ldm up, {r4, r5}
- SAVECY( r12, r11)
- RESTCY( r14)
- ADDSUBC r4, r4, r8
- ADDSUBC r5, r5, r9
- stm rp, {r4, r5}
- b L(rt1)
-
-L(e1): ldr r8, [vp]
- adcs r8, r8, r8
- ldr r4, [up]
- SAVECY( r12, r11)
- RESTCY( r14)
- ADDSUBC r4, r4, r8
- str r4, [rp]
-
-L(rt1): mov r14, r12
- REVCY(r12)
-L(rt0): RETVAL( r14)
- pop {r4-r10r11, r14}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/aorsmul_1.asm b/gmp/mpn/arm/aorsmul_1.asm
deleted file mode 100644
index b02fbb3b2a..0000000000
--- a/gmp/mpn/arm/aorsmul_1.asm
+++ /dev/null
@@ -1,135 +0,0 @@
-dnl ARM mpn_addmul_1 and mpn_submul_1.
-
-dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 5.25
-C Cortex-A15 4
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`vl', `r3')
-define(`rl', `r12')
-define(`ul', `r6')
-define(`r', `lr')
-
-ifdef(`OPERATION_addmul_1', `
- define(`ADDSUB', adds)
- define(`ADDSUBC', adcs)
- define(`CLRRCY', `mov $1, #0
- adds r0, r0, #0')
- define(`RETVAL', `adc r0, r4, #0')
- define(`func', mpn_addmul_1)')
-ifdef(`OPERATION_submul_1', `
- define(`ADDSUB', subs)
- define(`ADDSUBC', sbcs)
- define(`CLRRCY', `subs $1, r0, r0')
- define(`RETVAL', `sbc r0, r0, r0
- sub r0, $1, r0')
- define(`func', mpn_submul_1)')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-ASM_START()
-PROLOGUE(func)
- stmfd sp!, { r4-r6, lr }
- CLRRCY( r4)
- tst n, #1
- beq L(skip1)
- ldr ul, [up], #4
- ldr rl, [rp, #0]
- umull r5, r4, ul, vl
- ADDSUB r, rl, r5
- str r, [rp], #4
-L(skip1):
- tst n, #2
- beq L(skip2)
- ldr ul, [up], #4
- ldr rl, [rp, #0]
- mov r5, #0
- umlal r4, r5, ul, vl
- ldr ul, [up], #4
- ADDSUBC r, rl, r4
- ldr rl, [rp, #4]
- mov r4, #0
- umlal r5, r4, ul, vl
- str r, [rp], #4
- ADDSUBC r, rl, r5
- str r, [rp], #4
-L(skip2):
- bics n, n, #3
- beq L(rtn)
-
- ldr ul, [up], #4
- ldr rl, [rp, #0]
- mov r5, #0
- umlal r4, r5, ul, vl
- b L(in)
-
-L(top): ldr ul, [up], #4
- ADDSUBC r, rl, r5
- ldr rl, [rp, #4]
- mov r5, #0
- umlal r4, r5, ul, vl
- str r, [rp], #4
-L(in): ldr ul, [up], #4
- ADDSUBC r, rl, r4
- ldr rl, [rp, #4]
- mov r4, #0
- umlal r5, r4, ul, vl
- str r, [rp], #4
- ldr ul, [up], #4
- ADDSUBC r, rl, r5
- ldr rl, [rp, #4]
- mov r5, #0
- umlal r4, r5, ul, vl
- str r, [rp], #4
- ldr ul, [up], #4
- ADDSUBC r, rl, r4
- ldr rl, [rp, #4]
- mov r4, #0
- umlal r5, r4, ul, vl
- sub n, n, #4
- tst n, n
- str r, [rp], #4
- bne L(top)
-
- ADDSUBC r, rl, r5
- str r, [rp]
-
-L(rtn): RETVAL( r4)
- ldmfd sp!, { r4-r6, pc }
-EPILOGUE()
diff --git a/gmp/mpn/arm/arm-defs.m4 b/gmp/mpn/arm/arm-defs.m4
index 6ca964a245..9d169e822d 100644
--- a/gmp/mpn/arm/arm-defs.m4
+++ b/gmp/mpn/arm/arm-defs.m4
@@ -2,39 +2,28 @@ divert(-1)
dnl m4 macros for ARM assembler.
-dnl Copyright 2001, 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Standard commenting is with @, the default m4 # is for constants and we
dnl don't want to disable macro expansions in or after them.
-changecom(@&*$)
+changecom(@)
dnl APCS register names.
@@ -58,34 +47,4 @@ deflit(sp,r13)
deflit(lr,r14)
deflit(pc,r15)
-
-define(`lea_list', `')
-define(`lea_num',0)
-
-dnl LEA(reg,gmp_symbol)
-dnl
-dnl Load the address of gmp_symbol into a register. The gmp_symbol must be
-dnl either local or protected/hidden, since we assume it has a fixed distance
-dnl from the point of use.
-
-define(`LEA',`dnl
-ldr $1, L(ptr`'lea_num)
-ifdef(`PIC',dnl
-`dnl
-L(bas`'lea_num):dnl
- add $1, $1, pc`'dnl
- m4append(`lea_list',`
-L(ptr'lea_num`): .word GSYM_PREFIX`'$2-L(bas'lea_num`)-8')
- define(`lea_num', eval(lea_num+1))dnl
-',`dnl
- m4append(`lea_list',`
-L(ptr'lea_num`): .word GSYM_PREFIX`'$2')
- define(`lea_num', eval(lea_num+1))dnl
-')dnl
-')
-
-define(`EPILOGUE_cpu',
-`lea_list
- SIZE(`$1',.-`$1')')
-
divert
diff --git a/gmp/mpn/arm/bdiv_dbm1c.asm b/gmp/mpn/arm/bdiv_dbm1c.asm
deleted file mode 100644
index ec3de50e8e..0000000000
--- a/gmp/mpn/arm/bdiv_dbm1c.asm
+++ /dev/null
@@ -1,113 +0,0 @@
-dnl ARM mpn_bdiv_dbm1c.
-
-dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 4.25
-C Cortex-A15 2.5
-
-C TODO
-C * Try using umlal or umaal.
-C * Try using ldm/stm.
-
-define(`qp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`bd', `r3')
-define(`cy', `sp,#0')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_bdiv_dbm1c)
- push {r4, r5, r6, r7, r8}
- ldr r4, [up], #4
- ldr r5, [sp, #20]
- ands r12, n, #3
- beq L(fi0)
- cmp r12, #2
- bcc L(fi1)
- beq L(fi2)
-
-L(fi3): umull r8, r12, r4, bd
- ldr r4, [up], #4
- b L(lo3)
-
-L(fi0): umull r6, r7, r4, bd
- ldr r4, [up], #4
- b L(lo0)
-
-L(fi1): subs n, n, #1
- umull r8, r12, r4, bd
- bls L(wd1)
- ldr r4, [up], #4
- b L(lo1)
-
-L(fi2): umull r6, r7, r4, bd
- ldr r4, [up], #4
- b L(lo2)
-
-L(top): ldr r4, [up], #4
- subs r5, r5, r6
- str r5, [qp], #4
- sbc r5, r5, r7
-L(lo1): umull r6, r7, r4, bd
- ldr r4, [up], #4
- subs r5, r5, r8
- str r5, [qp], #4
- sbc r5, r5, r12
-L(lo0): umull r8, r12, r4, bd
- ldr r4, [up], #4
- subs r5, r5, r6
- str r5, [qp], #4
- sbc r5, r5, r7
-L(lo3): umull r6, r7, r4, bd
- ldr r4, [up], #4
- subs r5, r5, r8
- str r5, [qp], #4
- sbc r5, r5, r12
-L(lo2): subs n, n, #4
- umull r8, r12, r4, bd
- bhi L(top)
-
-L(wd2): subs r5, r5, r6
- str r5, [qp], #4
- sbc r5, r5, r7
-L(wd1): subs r5, r5, r8
- str r5, [qp]
- sbc r0, r5, r12
- pop {r4, r5, r6, r7, r8}
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/cnd_aors_n.asm b/gmp/mpn/arm/cnd_aors_n.asm
deleted file mode 100644
index e8eb60983a..0000000000
--- a/gmp/mpn/arm/cnd_aors_n.asm
+++ /dev/null
@@ -1,134 +0,0 @@
-dnl ARM mpn_cnd_add_n, mpn_cnd_sub_n
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3
-C Cortex-A15 2.5
-
-define(`cnd', `r0')
-define(`rp', `r1')
-define(`up', `r2')
-define(`vp', `r3')
-
-define(`n', `r12')
-
-
-ifdef(`OPERATION_cnd_add_n', `
- define(`ADDSUB', adds)
- define(`ADDSUBC', adcs)
- define(`INITCY', `cmn r0, #0')
- define(`RETVAL', `adc r0, n, #0')
- define(func, mpn_cnd_add_n)')
-ifdef(`OPERATION_cnd_sub_n', `
- define(`ADDSUB', subs)
- define(`ADDSUBC', sbcs)
- define(`INITCY', `cmp r0, #0')
- define(`RETVAL', `adc r0, n, #0
- rsb r0, r0, #1')
- define(func, mpn_cnd_sub_n)')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ASM_START()
-PROLOGUE(func)
- push {r4-r11}
- ldr n, [sp, #32]
-
- cmp cnd, #1
- sbc cnd, cnd, cnd C conditionally set to 0xffffffff
-
- INITCY C really only needed for n = 0 (mod 4)
-
- ands r4, n, #3
- beq L(top)
- cmp r4, #2
- bcc L(b1)
- beq L(b2)
-
-L(b3): ldm vp!, {r4,r5,r6}
- ldm up!, {r8,r9,r10}
- bic r4, r4, cnd
- bic r5, r5, cnd
- bic r6, r6, cnd
- ADDSUB r8, r8, r4
- ADDSUBC r9, r9, r5
- ADDSUBC r10, r10, r6
- stm rp!, {r8,r9,r10}
- sub n, n, #3
- teq n, #0
- bne L(top)
- b L(end)
-
-L(b2): ldm vp!, {r4,r5}
- ldm up!, {r8,r9}
- bic r4, r4, cnd
- bic r5, r5, cnd
- ADDSUB r8, r8, r4
- ADDSUBC r9, r9, r5
- stm rp!, {r8,r9}
- sub n, n, #2
- teq n, #0
- bne L(top)
- b L(end)
-
-L(b1): ldr r4, [vp], #4
- ldr r8, [up], #4
- bic r4, r4, cnd
- ADDSUB r8, r8, r4
- str r8, [rp], #4
- sub n, n, #1
- teq n, #0
- beq L(end)
-
-L(top): ldm vp!, {r4,r5,r6,r7}
- ldm up!, {r8,r9,r10,r11}
- bic r4, r4, cnd
- bic r5, r5, cnd
- bic r6, r6, cnd
- bic r7, r7, cnd
- ADDSUBC r8, r8, r4
- ADDSUBC r9, r9, r5
- ADDSUBC r10, r10, r6
- ADDSUBC r11, r11, r7
- sub n, n, #4
- stm rp!, {r8,r9,r10,r11}
- teq n, #0
- bne L(top)
-
-L(end): RETVAL
- pop {r4-r11}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/com.asm b/gmp/mpn/arm/com.asm
deleted file mode 100644
index 42f8e3cbbe..0000000000
--- a/gmp/mpn/arm/com.asm
+++ /dev/null
@@ -1,75 +0,0 @@
-dnl ARM mpn_com.
-
-dnl Copyright 2003, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 2.0
-C Cortex-A15 1.75
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-
-ASM_START()
-PROLOGUE(mpn_com)
- tst n, #1
- beq L(skip1)
- ldr r3, [up], #4
- mvn r3, r3
- str r3, [rp], #4
-L(skip1):
- tst n, #2
- beq L(skip2)
- ldmia up!, { r3, r12 } C load 2 limbs
- mvn r3, r3
- mvn r12, r12
- stmia rp!, { r3, r12 } C store 2 limbs
-L(skip2):
- bics n, n, #3
- beq L(rtn)
- stmfd sp!, { r7, r8, r9 } C save regs on stack
-
-L(top): ldmia up!, { r3, r8, r9, r12 } C load 4 limbs
- subs n, n, #4
- mvn r3, r3
- mvn r8, r8
- mvn r9, r9
- mvn r12, r12
- stmia rp!, { r3, r8, r9, r12 } C store 4 limbs
- bne L(top)
-
- ldmfd sp!, { r7, r8, r9 } C restore regs from stack
-L(rtn): bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/copyd.asm b/gmp/mpn/arm/copyd.asm
index 3ea2035099..718b762b91 100644
--- a/gmp/mpn/arm/copyd.asm
+++ b/gmp/mpn/arm/copyd.asm
@@ -1,59 +1,37 @@
dnl ARM mpn_copyd.
-dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
-
-dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+dnl Copyright 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.25-1.5
-C Cortex-A15 1.25
+C This runs at 3 cycles/limb in the StrongARM.
-C TODO
-C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9
-C and A15. But it probably slows things down for 8 <= n < a few dozen.
+define(`rp',`r0')
+define(`up',`r1')
+define(`n',`r2')
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
ASM_START()
PROLOGUE(mpn_copyd)
mov r12, n, lsl #2
sub r12, r12, #4
- add rp, rp, r12
- add up, up, r12
+ add rp, rp, r12 C make rp point at last limb
+ add up, up, r12 C make up point at last limb
tst n, #1
beq L(skip1)
@@ -62,23 +40,19 @@ PROLOGUE(mpn_copyd)
L(skip1):
tst n, #2
beq L(skip2)
- ldmda up!, { r3,r12 }
- stmda rp!, { r3,r12 }
+ ldmda up!, { r3, r12 } C load 2 limbs
+ stmda rp!, { r3, r12 } C store 2 limbs
L(skip2):
bics n, n, #3
- beq L(rtn)
-
- push { r4-r5 }
+ beq L(return)
+ stmfd sp!, { r7, r8, r9 } C save regs on stack
+L(loop):
+ ldmda up!, { r3, r8, r9, r12 } C load 4 limbs
+ ldr r7, [rp, #-12] C cache allocate
subs n, n, #4
- ldmda up!, { r3,r4,r5,r12 }
- beq L(end)
-
-L(top): subs n, n, #4
- stmda rp!, { r3,r4,r5,r12 }
- ldmda up!, { r3,r4,r5,r12 }
- bne L(top)
-
-L(end): stmda rp, { r3,r4,r5,r12 }
- pop { r4-r5 }
-L(rtn): bx lr
-EPILOGUE()
+ stmda rp!, { r3, r8, r9, r12 } C store 4 limbs
+ bne L(loop)
+ ldmfd sp!, { r7, r8, r9 } C restore regs from stack
+L(return):
+ mov pc, lr
+EPILOGUE(mpn_copyd)
diff --git a/gmp/mpn/arm/copyi.asm b/gmp/mpn/arm/copyi.asm
index fa454702c1..5ee93acd4c 100644
--- a/gmp/mpn/arm/copyi.asm
+++ b/gmp/mpn/arm/copyi.asm
@@ -1,52 +1,30 @@
dnl ARM mpn_copyi.
-dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
-
-dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+dnl Copyright 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.25-1.5
-C Cortex-A15 1.25
+C This runs at 3 cycles/limb in the StrongARM.
-C TODO
-C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9
-C and A15. But it probably slows things down for 8 <= n < a few dozen.
+define(`rp',`r0')
+define(`up',`r1')
+define(`n',`r2')
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
ASM_START()
PROLOGUE(mpn_copyi)
@@ -57,23 +35,19 @@ PROLOGUE(mpn_copyi)
L(skip1):
tst n, #2
beq L(skip2)
- ldmia up!, { r3,r12 }
- stmia rp!, { r3,r12 }
+ ldmia up!, { r3, r12 } C load 2 limbs
+ stmia rp!, { r3, r12 } C store 2 limbs
L(skip2):
bics n, n, #3
- beq L(rtn)
-
- push { r4-r5 }
+ beq L(return)
+ stmfd sp!, { r7, r8, r9 } C save regs on stack
+L(loop):
+ ldmia up!, { r3, r8, r9, r12 } C load 4 limbs
+ ldr r7, [rp, #12] C cache allocate
subs n, n, #4
- ldmia up!, { r3,r4,r5,r12 }
- beq L(end)
-
-L(top): subs n, n, #4
- stmia rp!, { r3,r4,r5,r12 }
- ldmia up!, { r3,r4,r5,r12 }
- bne L(top)
-
-L(end): stm rp, { r3,r4,r5,r12 }
- pop { r4-r5 }
-L(rtn): bx lr
-EPILOGUE()
+ stmia rp!, { r3, r8, r9, r12 } C store 4 limbs
+ bne L(loop)
+ ldmfd sp!, { r7, r8, r9 } C restore regs from stack
+L(return):
+ mov pc, lr
+EPILOGUE(mpn_copyi)
diff --git a/gmp/mpn/arm/dive_1.asm b/gmp/mpn/arm/dive_1.asm
deleted file mode 100644
index a695e47c77..0000000000
--- a/gmp/mpn/arm/dive_1.asm
+++ /dev/null
@@ -1,151 +0,0 @@
-dnl ARM v4 mpn_modexact_1c_odd
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb
-C norm unorm modexact_1c_odd
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 10 12
-C Cortex-A15 9 9
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te -
-C v6 -
-C v6t2 -
-C v7a -
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`d', `r3')
-
-define(`cy', `r7')
-define(`cnt', `r6')
-define(`tnc', `r8')
-
-ASM_START()
-PROLOGUE(mpn_divexact_1)
- tst d, #1
- push {r4-r9}
- mov cnt, #0
- bne L(inv)
-
-C count trailing zeros
- movs r4, d, lsl #16
- moveq d, d, lsr #16
- moveq cnt, #16
- tst d, #0xff
- moveq d, d, lsr #8
- addeq cnt, cnt, #8
- LEA( r4, ctz_tab)
- and r5, d, #0xff
- ldrb r4, [r4, r5]
- mov d, d, lsr r4
- add cnt, cnt, r4
-
-C binvert limb
-L(inv): LEA( r4, binvert_limb_table)
- and r12, d, #254
- ldrb r4, [r4, r12, lsr #1]
- mul r12, r4, r4
- mul r12, d, r12
- rsb r12, r12, r4, lsl #1
- mul r4, r12, r12
- mul r4, d, r4
- rsb r4, r4, r12, lsl #1 C r4 = inverse
-
- tst cnt, cnt
- ldr r5, [up], #4 C up[0]
- mov cy, #0
- bne L(unnorm)
-
-L(norm):
- subs n, n, #1 C set carry as side-effect
- beq L(end)
-
- ALIGN(16)
-L(top): sbcs cy, r5, cy
- ldr r5, [up], #4
- sub n, n, #1
- mul r9, r4, cy
- tst n, n
- umull r12, cy, d, r9
- str r9, [rp], #4
- bne L(top)
-
-L(end): sbc cy, r5, cy
- mul r9, r4, cy
- str r9, [rp]
- pop {r4-r9}
- bx r14
-
-L(unnorm):
- rsb tnc, cnt, #32
- mov r5, r5, lsr cnt
- subs n, n, #1 C set carry as side-effect
- beq L(edu)
-
- ALIGN(16)
-L(tpu): ldr r12, [up], #4
- orr r9, r5, r12, lsl tnc
- mov r5, r12, lsr cnt
- sbcs cy, r9, cy C critical path ->cy->cy->
- sub n, n, #1
- mul r9, r4, cy C critical path ->cy->r9->
- tst n, n
- umull r12, cy, d, r9 C critical path ->r9->cy->
- str r9, [rp], #4
- bne L(tpu)
-
-L(edu): sbc cy, r5, cy
- mul r9, r4, cy
- str r9, [rp]
- pop {r4-r9}
- bx r14
-EPILOGUE()
-
- .section .rodata
-ctz_tab:
- .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
- .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
- .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
- .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
- .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
- .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
- .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
- .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
diff --git a/gmp/mpn/arm/gmp-mparam.h b/gmp/mpn/arm/gmp-mparam.h
index 87eec3a149..7afb06ac89 100644
--- a/gmp/mpn/arm/gmp-mparam.h
+++ b/gmp/mpn/arm/gmp-mparam.h
@@ -1,127 +1,75 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 1193MHz ARM (gcc55.fsffrance.org) */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 56
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 11
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 71
-#define USE_PREINV_DIVREM_1 1 /* preinv always */
-#define DIVREM_2_THRESHOLD 0 /* preinv always */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 41
-
-#define MUL_TOOM22_THRESHOLD 36
-#define MUL_TOOM33_THRESHOLD 125
-#define MUL_TOOM44_THRESHOLD 193
-#define MUL_TOOM6H_THRESHOLD 303
-#define MUL_TOOM8H_THRESHOLD 418
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 125
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 176
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129
-
-#define SQR_BASECASE_THRESHOLD 12
-#define SQR_TOOM2_THRESHOLD 78
-#define SQR_TOOM3_THRESHOLD 137
-#define SQR_TOOM4_THRESHOLD 212
-#define SQR_TOOM6_THRESHOLD 306
-#define SQR_TOOM8_THRESHOLD 422
-
-#define MULMOD_BNM1_THRESHOLD 20
-#define SQRMOD_BNM1_THRESHOLD 26
-
-#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 436, 5}, { 27, 6}, { 28, 7}, { 15, 6}, \
- { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
- { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
- { 256, 9}, { 512,10}, { 1024,11}, { 2048,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 28
-#define MUL_FFT_THRESHOLD 5760
-
-#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 404, 5}, { 13, 4}, { 27, 5}, { 27, 6}, \
- { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
- { 35, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 39, 9}, { 512,10}, \
- { 1024,11}, { 2048,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 26
-#define SQR_FFT_THRESHOLD 3776
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 137
-#define MULLO_MUL_N_THRESHOLD 11479
-
-#define DC_DIV_QR_THRESHOLD 150
-#define DC_DIVAPPR_Q_THRESHOLD 494
-#define DC_BDIV_QR_THRESHOLD 148
-#define DC_BDIV_Q_THRESHOLD 345
-
-#define INV_MULMOD_BNM1_THRESHOLD 70
-#define INV_NEWTON_THRESHOLD 474
-#define INV_APPR_THRESHOLD 478
-
-#define BINV_NEWTON_THRESHOLD 542
-#define REDC_1_TO_REDC_N_THRESHOLD 117
-
-#define MU_DIV_QR_THRESHOLD 2089
-#define MU_DIVAPPR_Q_THRESHOLD 2172
-#define MUPI_DIV_QR_THRESHOLD 225
-#define MU_BDIV_QR_THRESHOLD 1528
-#define MU_BDIV_Q_THRESHOLD 2089
-
-#define MATRIX22_STRASSEN_THRESHOLD 16
-#define HGCD_THRESHOLD 197
-#define GCD_DC_THRESHOLD 902
-#define GCDEXT_DC_THRESHOLD 650
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 20
-#define GET_STR_PRECOMPUTE_THRESHOLD 39
-#define SET_STR_DC_THRESHOLD 1045
-#define SET_STR_PRECOMPUTE_THRESHOLD 2147
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+
+/* 593MHz ARM (gcc50.fsffrance.org) */
+
+/* Generated by tuneup.c, 2009-03-05, gcc 4.3 */
+
+#define MUL_KARATSUBA_THRESHOLD 34
+#define MUL_TOOM3_THRESHOLD 125
+#define MUL_TOOM44_THRESHOLD 184
+
+#define SQR_BASECASE_THRESHOLD 15
+#define SQR_KARATSUBA_THRESHOLD 82
+#define SQR_TOOM3_THRESHOLD 147
+#define SQR_TOOM4_THRESHOLD 212
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 128
+#define MULLOW_MUL_N_THRESHOLD 1095
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */
+#define DIV_DC_THRESHOLD 130
+#define POWM_THRESHOLD 200
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 110
+#define GCD_DC_THRESHOLD 734
+#define GCDEXT_DC_THRESHOLD 748
+#define JACOBI_BASE_METHOD 2
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 10
+#define MOD_1_2_THRESHOLD 996
+#define MOD_1_4_THRESHOLD 997
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define USE_PREINV_MOD_1 1 /* preinv always */
+#define DIVREM_2_THRESHOLD 0 /* preinv always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 35
+#define SET_STR_DC_THRESHOLD 321
+#define SET_STR_PRECOMPUTE_THRESHOLD 1057
+
+#define MUL_FFT_TABLE { 400, 928, 1920, 4608, 14336, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 416
+#define MUL_FFT_THRESHOLD 5888
+
+#define SQR_FFT_TABLE { 432, 928, 1664, 4608, 10240, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 376
+#define SQR_FFT_THRESHOLD 4352
diff --git a/gmp/mpn/arm/invert_limb.asm b/gmp/mpn/arm/invert_limb.asm
index d4c3afe2da..39d3bb2e14 100644
--- a/gmp/mpn/arm/invert_limb.asm
+++ b/gmp/mpn/arm/invert_limb.asm
@@ -1,93 +1,95 @@
dnl ARM mpn_invert_limb -- Invert a normalized limb.
-dnl Copyright 2001, 2009, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-ASM_START()
+C INPUT PARAMETERS
+define(`d',`r0') C number to be inverted
+
+
PROLOGUE(mpn_invert_limb)
- LEA( r2, approx_tab-512)
- mov r3, r0, lsr #23
- mov r3, r3, asl #1
- ldrh r3, [r3, r2]
- mov r1, r3, asl #17
- mul r12, r3, r3
- umull r3, r2, r12, r0
- sub r1, r1, r2, asl #1
- umull r3, r2, r1, r1
- umull r12, r3, r0, r3
- umull r2, r12, r0, r2
- adds r2, r2, r3
- adc r12, r12, #0
- rsb r1, r12, r1
- mvn r2, r2, lsr #30
- add r2, r2, r1, asl #2
- umull r12, r3, r0, r2
- adds r1, r12, r0
- adc r3, r3, r0
- rsb r0, r3, r2
- bx lr
-EPILOGUE()
+ stmfd sp!, {r4, lr}
+ mov r3, d, lsr #23
+ sub r3, r3, #256
+ add r2, pc, #invtab-.-8
+ mov r3, r3, lsl #1
+ ldrh r1, [r2, r3] C get initial approximation from table
+ mov r2, r1, lsl #6 C start iteration 1
+ mul ip, r2, r2
+ umull lr, r4, ip, d
+ mov r2, r4, lsl #1
+ rsb r2, r2, r1, lsl #23 C iteration 1 complete
+ umull ip, r3, r2, r2 C start iteration 2
+ umull lr, r4, r3, d
+ umull r3, r1, ip, d
+ adds lr, lr, r1
+ addcs r4, r4, #1
+ mov r3, lr, lsr #30
+ orr r4, r3, r4, lsl #2
+ mov lr, lr, lsl #2
+ cmn lr, #1
+ rsc r2, r4, r2, lsl #2 C iteration 2 complete
+ umull ip, r1, d, r2 C start adjustment step
+ add r1, r1, d
+ cmn r1, #1
+ beq L(1)
+ adds ip, ip, d
+ adc r1, r1, #0
+ add r2, r2, #1
+L(1):
+ adds r3, ip, d
+ adcs r1, r1, #0
+ moveq r0, r2
+ addne r0, r2, #1
+ ldmfd sp!, {r4, pc}
- .section .rodata
- ALIGN(2)
-approx_tab:
- .short 0xffc0,0xfec0,0xfdc0,0xfcc0,0xfbc0,0xfac0,0xfa00,0xf900
- .short 0xf800,0xf700,0xf640,0xf540,0xf440,0xf380,0xf280,0xf180
- .short 0xf0c0,0xefc0,0xef00,0xee00,0xed40,0xec40,0xeb80,0xeac0
- .short 0xe9c0,0xe900,0xe840,0xe740,0xe680,0xe5c0,0xe500,0xe400
- .short 0xe340,0xe280,0xe1c0,0xe100,0xe040,0xdf80,0xdec0,0xde00
- .short 0xdd40,0xdc80,0xdbc0,0xdb00,0xda40,0xd980,0xd8c0,0xd800
- .short 0xd740,0xd680,0xd600,0xd540,0xd480,0xd3c0,0xd340,0xd280
- .short 0xd1c0,0xd140,0xd080,0xcfc0,0xcf40,0xce80,0xcdc0,0xcd40
- .short 0xcc80,0xcc00,0xcb40,0xcac0,0xca00,0xc980,0xc8c0,0xc840
- .short 0xc780,0xc700,0xc640,0xc5c0,0xc540,0xc480,0xc400,0xc380
- .short 0xc2c0,0xc240,0xc1c0,0xc100,0xc080,0xc000,0xbf80,0xbec0
- .short 0xbe40,0xbdc0,0xbd40,0xbc80,0xbc00,0xbb80,0xbb00,0xba80
- .short 0xba00,0xb980,0xb900,0xb840,0xb7c0,0xb740,0xb6c0,0xb640
- .short 0xb5c0,0xb540,0xb4c0,0xb440,0xb3c0,0xb340,0xb2c0,0xb240
- .short 0xb1c0,0xb140,0xb0c0,0xb080,0xb000,0xaf80,0xaf00,0xae80
- .short 0xae00,0xad80,0xad40,0xacc0,0xac40,0xabc0,0xab40,0xaac0
- .short 0xaa80,0xaa00,0xa980,0xa900,0xa8c0,0xa840,0xa7c0,0xa740
- .short 0xa700,0xa680,0xa600,0xa5c0,0xa540,0xa4c0,0xa480,0xa400
- .short 0xa380,0xa340,0xa2c0,0xa240,0xa200,0xa180,0xa140,0xa0c0
- .short 0xa080,0xa000,0x9f80,0x9f40,0x9ec0,0x9e80,0x9e00,0x9dc0
- .short 0x9d40,0x9d00,0x9c80,0x9c40,0x9bc0,0x9b80,0x9b00,0x9ac0
- .short 0x9a40,0x9a00,0x9980,0x9940,0x98c0,0x9880,0x9840,0x97c0
- .short 0x9780,0x9700,0x96c0,0x9680,0x9600,0x95c0,0x9580,0x9500
- .short 0x94c0,0x9440,0x9400,0x93c0,0x9340,0x9300,0x92c0,0x9240
- .short 0x9200,0x91c0,0x9180,0x9100,0x90c0,0x9080,0x9000,0x8fc0
- .short 0x8f80,0x8f40,0x8ec0,0x8e80,0x8e40,0x8e00,0x8d80,0x8d40
- .short 0x8d00,0x8cc0,0x8c80,0x8c00,0x8bc0,0x8b80,0x8b40,0x8b00
- .short 0x8a80,0x8a40,0x8a00,0x89c0,0x8980,0x8940,0x88c0,0x8880
- .short 0x8840,0x8800,0x87c0,0x8780,0x8740,0x8700,0x8680,0x8640
- .short 0x8600,0x85c0,0x8580,0x8540,0x8500,0x84c0,0x8480,0x8440
- .short 0x8400,0x8380,0x8340,0x8300,0x82c0,0x8280,0x8240,0x8200
- .short 0x81c0,0x8180,0x8140,0x8100,0x80c0,0x8080,0x8040,0x8000
-ASM_END()
+invtab:
+ .short 1023,1020,1016,1012,1008,1004,1000,996
+ .short 992,989,985,981,978,974,970,967
+ .short 963,960,956,953,949,946,942,939
+ .short 936,932,929,926,923,919,916,913
+ .short 910,907,903,900,897,894,891,888
+ .short 885,882,879,876,873,870,868,865
+ .short 862,859,856,853,851,848,845,842
+ .short 840,837,834,832,829,826,824,821
+ .short 819,816,814,811,809,806,804,801
+ .short 799,796,794,791,789,787,784,782
+ .short 780,777,775,773,771,768,766,764
+ .short 762,759,757,755,753,751,748,746
+ .short 744,742,740,738,736,734,732,730
+ .short 728,726,724,722,720,718,716,714
+ .short 712,710,708,706,704,702,700,699
+ .short 697,695,693,691,689,688,686,684
+ .short 682,680,679,677,675,673,672,670
+ .short 668,667,665,663,661,660,658,657
+ .short 655,653,652,650,648,647,645,644
+ .short 642,640,639,637,636,634,633,631
+ .short 630,628,627,625,624,622,621,619
+ .short 618,616,615,613,612,611,609,608
+ .short 606,605,604,602,601,599,598,597
+ .short 595,594,593,591,590,589,587,586
+ .short 585,583,582,581,579,578,577,576
+ .short 574,573,572,571,569,568,567,566
+ .short 564,563,562,561,560,558,557,556
+ .short 555,554,553,551,550,549,548,547
+ .short 546,544,543,542,541,540,539,538
+ .short 537,536,534,533,532,531,530,529
+ .short 528,527,526,525,524,523,522,521
+ .short 520,519,518,517,516,515,514,513
+EPILOGUE(mpn_invert_limb)
diff --git a/gmp/mpn/arm/logops_n.asm b/gmp/mpn/arm/logops_n.asm
deleted file mode 100644
index 5a61683fc2..0000000000
--- a/gmp/mpn/arm/logops_n.asm
+++ /dev/null
@@ -1,139 +0,0 @@
-dnl ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb
-C and andn ior xor nand iorn nior xnor
-C StrongARM ? ?
-C XScale ? ?
-C Cortex-A7 ? ?
-C Cortex-A8 ? ?
-C Cortex-A9 2.5-2.72 2.75-3
-C Cortex-A15 2.25 2.75
-
-C TODO
-C * It seems that 2.25 c/l and 2.75 c/l is possible for A9.
-C * Debug popping issue, see comment below.
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-define(`POSTOP')
-
-ifdef(`OPERATION_and_n',`
- define(`func', `mpn_and_n')
- define(`LOGOP', `and $1, $2, $3')')
-ifdef(`OPERATION_andn_n',`
- define(`func', `mpn_andn_n')
- define(`LOGOP', `bic $1, $2, $3')')
-ifdef(`OPERATION_nand_n',`
- define(`func', `mpn_nand_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `and $1, $2, $3')')
-ifdef(`OPERATION_ior_n',`
- define(`func', `mpn_ior_n')
- define(`LOGOP', `orr $1, $2, $3')')
-ifdef(`OPERATION_iorn_n',`
- define(`func', `mpn_iorn_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `bic $1, $3, $2')')
-ifdef(`OPERATION_nior_n',`
- define(`func', `mpn_nior_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `orr $1, $2, $3')')
-ifdef(`OPERATION_xor_n',`
- define(`func', `mpn_xor_n')
- define(`LOGOP', `eor $1, $2, $3')')
-ifdef(`OPERATION_xnor_n',`
- define(`func', `mpn_xnor_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `eor $1, $2, $3')')
-
-MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-
-ASM_START()
-PROLOGUE(func)
- push { r8, r9, r10 }
- tst n, #1
- beq L(skip1)
- ldr r10, [vp], #4
- ldr r12, [up], #4
- LOGOP( r12, r12, r10)
- POSTOP( r12)
- str r12, [rp], #4
-L(skip1):
- tst n, #2
- beq L(skip2)
- ldmia vp!, { r10, r12 }
- ldmia up!, { r8, r9 }
- LOGOP( r8, r8, r10)
- LOGOP( r9, r9, r12)
- POSTOP( r8)
- POSTOP( r9)
- stmia rp!, { r8, r9 }
-L(skip2):
- bics n, n, #3
- beq L(rtn)
- push { r4, r5, r6, r7 }
-
- ldmia vp!, { r8, r9, r10, r12 }
- b L(mid)
-
-L(top): ldmia vp!, { r8, r9, r10, r12 }
- POSTOP( r4)
- POSTOP( r5)
- POSTOP( r6)
- POSTOP( r7)
- stmia rp!, { r4, r5, r6, r7 }
-L(mid): sub n, n, #4
- ldmia up!, { r4, r5, r6, r7 }
- teq n, #0
- LOGOP( r4, r4, r8)
- LOGOP( r5, r5, r9)
- LOGOP( r6, r6, r10)
- LOGOP( r7, r7, r12)
- bne L(top)
-
- POSTOP( r4)
- POSTOP( r5)
- POSTOP( r6)
- POSTOP( r7)
- stmia rp!, { r4, r5, r6, r7 }
-
- pop { r4, r5, r6, r7 } C popping r8-r10 here strangely fails
-
-L(rtn): pop { r8, r9, r10 }
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/lshift.asm b/gmp/mpn/arm/lshift.asm
deleted file mode 100644
index 9f777eb4dd..0000000000
--- a/gmp/mpn/arm/lshift.asm
+++ /dev/null
@@ -1,88 +0,0 @@
-dnl ARM mpn_lshift.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.5
-C Cortex-A15 ?
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`cnt', `r3')
-define(`tnc', `r12')
-
-ASM_START()
-PROLOGUE(mpn_lshift)
- add up, up, n, lsl #2
- push {r4, r6, r7, r8}
- ldr r4, [up, #-4]!
- add rp, rp, n, lsl #2
- rsb tnc, cnt, #32
-
- mov r7, r4, lsl cnt
- tst n, #1
- beq L(evn) C n even
-
-L(odd): subs n, n, #2
- bcc L(1) C n = 1
- ldr r8, [up, #-4]!
- b L(mid)
-
-L(evn): ldr r6, [up, #-4]!
- subs n, n, #2
- beq L(end)
-
-L(top): ldr r8, [up, #-4]!
- orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r6, lsl cnt
-L(mid): ldr r6, [up, #-4]!
- orr r7, r7, r8, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r8, lsl cnt
- subs n, n, #2
- bgt L(top)
-
-L(end): orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r6, lsl cnt
-L(1): str r7, [rp, #-4]
- mov r0, r4, lsr tnc
- pop {r4, r6, r7, r8}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/lshiftc.asm b/gmp/mpn/arm/lshiftc.asm
deleted file mode 100644
index 5f3d6e3f5b..0000000000
--- a/gmp/mpn/arm/lshiftc.asm
+++ /dev/null
@@ -1,95 +0,0 @@
-dnl ARM mpn_lshiftc.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 4.0
-C Cortex-A15 ?
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`cnt', `r3')
-define(`tnc', `r12')
-
-ASM_START()
-PROLOGUE(mpn_lshiftc)
- add up, up, n, lsl #2
- push {r4, r6, r7, r8}
- ldr r4, [up, #-4]!
- add rp, rp, n, lsl #2
- rsb tnc, cnt, #32
- mvn r6, r4
-
- mov r7, r6, lsl cnt
- tst n, #1
- beq L(evn) C n even
-
-L(odd): subs n, n, #2
- bcc L(1) C n = 1
- ldr r8, [up, #-4]!
- mvn r8, r8
- b L(mid)
-
-L(evn): ldr r6, [up, #-4]!
- mvn r6, r6
- subs n, n, #2
- beq L(end)
-
-L(top): ldr r8, [up, #-4]!
- orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mvn r8, r8
- mov r7, r6, lsl cnt
-L(mid): ldr r6, [up, #-4]!
- orr r7, r7, r8, lsr tnc
- str r7, [rp, #-4]!
- mvn r6, r6
- mov r7, r8, lsl cnt
- subs n, n, #2
- bgt L(top)
-
-L(end): orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r6, lsl cnt
-L(1): mvn r6, #0
- orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]
- mov r0, r4, lsr tnc
- pop {r4, r6, r7, r8}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/mod_34lsub1.asm b/gmp/mpn/arm/mod_34lsub1.asm
deleted file mode 100644
index ba3c06d8db..0000000000
--- a/gmp/mpn/arm/mod_34lsub1.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.33
-C Cortex-A15 1.33
-
-define(`ap', r0)
-define(`n', r1)
-
-C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
-
-C TODO
-C * Write cleverer summation code.
-C * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l.
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mod_34lsub1)
- push { r4, r5, r6, r7 }
-
- subs n, n, #3
- mov r7, #0
- blt L(le2) C n <= 2
-
- ldmia ap!, { r2, r3, r12 }
- subs n, n, #3
- blt L(sum) C n <= 5
- cmn r0, #0 C clear carry
- sub n, n, #3
- b L(mid)
-
-L(top): adcs r2, r2, r4
- adcs r3, r3, r5
- adcs r12, r12, r6
-L(mid): ldmia ap!, { r4, r5, r6 }
- tst n, n
- sub n, n, #3
- bpl L(top)
-
- add n, n, #3
-
- adcs r2, r2, r4
- adcs r3, r3, r5
- adcs r12, r12, r6
- movcs r7, #1 C r7 <= 1
-
-L(sum): cmn n, #2
- movlo r4, #0
- ldrhs r4, [ap], #4
- movls r5, #0
- ldrhi r5, [ap], #4
-
- adds r2, r2, r4
- adcs r3, r3, r5
- adcs r12, r12, #0
- adc r7, r7, #0 C r7 <= 2
-
-L(sum2):
- bic r0, r2, #0xff000000
- add r0, r0, r2, lsr #24
- add r0, r0, r7
-
- mov r7, r3, lsl #8
- bic r1, r7, #0xff000000
- add r0, r0, r1
- add r0, r0, r3, lsr #16
-
- mov r7, r12, lsl #16
- bic r1, r7, #0xff000000
- add r0, r0, r1
- add r0, r0, r12, lsr #8
-
- pop { r4, r5, r6, r7 }
- bx lr
-
-L(le2): cmn n, #1
- bne L(1)
- ldmia ap!, { r2, r3 }
- mov r12, #0
- b L(sum2)
-L(1): ldr r2, [ap]
- bic r0, r2, #0xff000000
- add r0, r0, r2, lsr #24
- pop { r4, r5, r6, r7 }
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/mode1o.asm b/gmp/mpn/arm/mode1o.asm
deleted file mode 100644
index 5e0f78fc8f..0000000000
--- a/gmp/mpn/arm/mode1o.asm
+++ /dev/null
@@ -1,92 +0,0 @@
-dnl ARM mpn_modexact_1c_odd
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 10
-C Cortex-A15 9
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te -
-C v6 -
-C v6t2 -
-C v7a -
-
-define(`up', `r0')
-define(`n', `r1')
-define(`d', `r2')
-define(`cy', `r3')
-
- .protected binvert_limb_table
-ASM_START()
-PROLOGUE(mpn_modexact_1c_odd)
- stmfd sp!, {r4, r5}
-
- LEA( r4, binvert_limb_table)
-
- ldr r5, [up], #4 C up[0]
-
- and r12, d, #254
- ldrb r4, [r4, r12, lsr #1]
- mul r12, r4, r4
- mul r12, d, r12
- rsb r12, r12, r4, asl #1
- mul r4, r12, r12
- mul r4, d, r4
- rsb r4, r4, r12, asl #1 C r4 = inverse
-
- subs n, n, #1 C set carry as side-effect
- beq L(end)
-
-L(top): sbcs cy, r5, cy
- ldr r5, [up], #4
- sub n, n, #1
- mul r12, r4, cy
- tst n, n
- umull r12, cy, d, r12
- bne L(top)
-
-L(end): sbcs cy, r5, cy
- mul r12, r4, cy
- umull r12, r0, d, r12
- addcc r0, r0, #1
-
- ldmfd sp!, {r4, r5}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/mul_1.asm b/gmp/mpn/arm/mul_1.asm
index f7bc1bc386..e867351881 100644
--- a/gmp/mpn/arm/mul_1.asm
+++ b/gmp/mpn/arm/mul_1.asm
@@ -2,43 +2,28 @@ dnl ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
dnl in a second limb vector.
dnl Contributed by Robert Harley.
-dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C StrongARM 6-8
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 4.75
-C Cortex-A15 ?
+C cycles/limb
+C StrongARM: 6-8 (dependent on vl value)
+C XScale: ?-?
C We should rewrite this along the lines of addmul_1.asm. That should save a
C cycle on StrongARM, and several cycles on XScale.
@@ -69,10 +54,10 @@ L(skip1):
stmia rp!, { r8, r9 }
L(skip2):
bics n, n, #3
- beq L(rtn)
+ beq L(return)
stmfd sp!, { r6, r7 }
-
-L(top): mov r6, r12
+L(loop):
+ mov r6, r12
ldmia up!, { r8, r9, r12, lr }
ldr r7, [rp, #12] C cache allocate
mov r7, #0
@@ -85,10 +70,9 @@ L(top): mov r6, r12
umlal r9, r12, lr, vl
subs n, n, #4
stmia rp!, { r6, r7, r8, r9 }
- bne L(top)
-
+ bne L(loop)
ldmfd sp!, { r6, r7 }
-
-L(rtn): mov r0, r12
+L(return):
+ mov r0, r12
ldmfd sp!, { r8, r9, pc }
-EPILOGUE()
+EPILOGUE(mpn_mul_1)
diff --git a/gmp/mpn/arm/neon/README b/gmp/mpn/arm/neon/README
deleted file mode 100644
index 79e3b48ee6..0000000000
--- a/gmp/mpn/arm/neon/README
+++ /dev/null
@@ -1,2 +0,0 @@
-This directory contains Neon code which runs and is efficient on all
-ARM CPUs which support Neon.
diff --git a/gmp/mpn/arm/neon/hamdist.asm b/gmp/mpn/arm/neon/hamdist.asm
deleted file mode 100644
index 232089647d..0000000000
--- a/gmp/mpn/arm/neon/hamdist.asm
+++ /dev/null
@@ -1,194 +0,0 @@
-dnl ARM Neon mpn_hamdist -- mpn bit hamming distance.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.89
-C Cortex-A15 0.95
-
-C TODO
-C * Explore using vldr and vldm. Does it help on A9? (These loads do
-C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for
-C popcount. Except perhaps also for popcount for the edge loads.)
-C * Arrange to align the pointer, if that helps performance. Use the same
-C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
-C valgrind!)
-C * Explore if explicit align directives, e.g., "[ptr:128]" help.
-C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
-
-C INPUT PARAMETERS
-define(`ap', r0)
-define(`bp', r1)
-define(`n', r2)
-
-C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
-C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or
-C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which
-C can be represented as a 8-bit ARM constant.
-C
-define(`chunksize',0x3f80)
-
-ASM_START()
-PROLOGUE(mpn_hamdist)
-
- cmp n, #chunksize
- bhi L(gt16k)
-
-L(lt16k):
- vmov.i64 q8, #0 C clear summation register
- vmov.i64 q9, #0 C clear summation register
-
- tst n, #1
- beq L(xxx0)
- vmov.i64 d0, #0
- vmov.i64 d20, #0
- sub n, n, #1
- vld1.32 {d0[0]}, [ap]! C load 1 limb
- vld1.32 {d20[0]}, [bp]! C load 1 limb
- veor d0, d0, d20
- vcnt.8 d24, d0
- vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
-
-L(xxx0):tst n, #2
- beq L(xx00)
- sub n, n, #2
- vld1.32 {d0}, [ap]! C load 2 limbs
- vld1.32 {d20}, [bp]! C load 2 limbs
- veor d0, d0, d20
- vcnt.8 d24, d0
- vpadal.u8 d16, d24
-
-L(xx00):tst n, #4
- beq L(x000)
- sub n, n, #4
- vld1.32 {q0}, [ap]! C load 4 limbs
- vld1.32 {q10}, [bp]! C load 4 limbs
- veor q0, q0, q10
- vcnt.8 q12, q0
- vpadal.u8 q8, q12
-
-L(x000):tst n, #8
- beq L(0000)
-
- subs n, n, #8
- vld1.32 {q0,q1}, [ap]! C load 8 limbs
- vld1.32 {q10,q11}, [bp]! C load 8 limbs
- bls L(sum)
-
-L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
- vld1.32 {q14,q15}, [bp]! C load 8 limbs
- veor q0, q0, q10
- veor q1, q1, q11
- sub n, n, #8
- vcnt.8 q12, q0
- vcnt.8 q13, q1
- b L(mid)
-
-L(0000):subs n, n, #16
- blo L(e0)
-
- vld1.32 {q2,q3}, [ap]! C load 8 limbs
- vld1.32 {q0,q1}, [ap]! C load 8 limbs
- vld1.32 {q14,q15}, [bp]! C load 8 limbs
- vld1.32 {q10,q11}, [bp]! C load 8 limbs
- veor q2, q2, q14
- veor q3, q3, q15
- vcnt.8 q12, q2
- vcnt.8 q13, q3
- subs n, n, #16
- blo L(end)
-
-L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
- vld1.32 {q14,q15}, [bp]! C load 8 limbs
- veor q0, q0, q10
- veor q1, q1, q11
- vpadal.u8 q8, q12
- vcnt.8 q12, q0
- vpadal.u8 q9, q13
- vcnt.8 q13, q1
-L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
- vld1.32 {q10,q11}, [bp]! C load 8 limbs
- veor q2, q2, q14
- veor q3, q3, q15
- subs n, n, #16
- vpadal.u8 q8, q12
- vcnt.8 q12, q2
- vpadal.u8 q9, q13
- vcnt.8 q13, q3
- bhs L(top)
-
-L(end): vpadal.u8 q8, q12
- vpadal.u8 q9, q13
-L(sum): veor q0, q0, q10
- veor q1, q1, q11
- vcnt.8 q12, q0
- vcnt.8 q13, q1
- vpadal.u8 q8, q12
- vpadal.u8 q9, q13
- vadd.i16 q8, q8, q9
- C we have 8 16-bit counts
-L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
- vpaddl.u32 q8, q8 C we have 2 64-bit counts
- vmov.32 r0, d16[0]
- vmov.32 r1, d17[0]
- add r0, r0, r1
- bx lr
-
-C Code for large count. Splits operand and calls above code.
-define(`ap2', r5)
-define(`bp2', r6)
-L(gt16k):
- push {r4,r5,r6,r14}
- mov ap2, ap
- mov bp2, bp
- mov r3, n C full count
- mov r4, #0 C total sum
-
-1: mov n, #chunksize C count for this invocation
- bl L(lt16k) C could jump deep inside code
- add ap2, ap2, #chunksize*4 C point at next chunk
- add bp2, bp2, #chunksize*4 C point at next chunk
- add r4, r4, r0
- mov ap, ap2 C put chunk pointer in place for call
- mov bp, bp2 C put chunk pointer in place for call
- sub r3, r3, #chunksize
- cmp r3, #chunksize
- bhi 1b
-
- mov n, r3 C count for final invocation
- bl L(lt16k)
- add r0, r4, r0
- pop {r4,r5,r6,pc}
-EPILOGUE()
diff --git a/gmp/mpn/arm/neon/lorrshift.asm b/gmp/mpn/arm/neon/lorrshift.asm
deleted file mode 100644
index 3d6253fd49..0000000000
--- a/gmp/mpn/arm/neon/lorrshift.asm
+++ /dev/null
@@ -1,279 +0,0 @@
-dnl ARM Neon mpn_lshift and mpn_rshift.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C StrongARM - -
-C XScale - -
-C Cortex-A7 ? ?
-C Cortex-A8 ? ?
-C Cortex-A9 3 3 Y
-C Cortex-A15 1.5 1.5 Y
-
-
-C We read 64 bits at a time at 32-bit aligned addresses, and except for the
-C first and last store, we write using 64-bit aligned addresses. All shifting
-C is done on 64-bit words in 'extension' registers.
-C
-C It should be possible to read also using 64-bit alignment, by manipulating
-C the shift count for unaligned operands. Not done, since it does not seem to
-C matter for A9 or A15.
-C
-C This will not work in big-endian mode.
-
-C TODO
-C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
-C which might make it tricky.
-C * Clean up and simplify.
-C * Consider sharing most of the code for lshift and rshift, since the feed-in code,
-C the loop, and most of the wind-down code are identical.
-C * Replace the basecase code with code using 'extension' registers.
-C * Optimise. It is not clear that this loop insn permutation is optimal for
-C either A9 or A15.
-
-C INPUT PARAMETERS
-define(`rp', `r0')
-define(`ap', `r1')
-define(`n', `r2')
-define(`cnt', `r3')
-
-ifdef(`OPERATION_lshift',`
- define(`IFLSH', `$1')
- define(`IFRSH', `')
- define(`X',`0')
- define(`Y',`1')
- define(`func',`mpn_lshift')
-')
-ifdef(`OPERATION_rshift',`
- define(`IFLSH', `')
- define(`IFRSH', `$1')
- define(`X',`1')
- define(`Y',`0')
- define(`func',`mpn_rshift')
-')
-
-MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(func)
-IFLSH(` mov r12, n, lsl #2 ')
-IFLSH(` add rp, rp, r12 ')
-IFLSH(` add ap, ap, r12 ')
-
- cmp n, #4 C SIMD code n limit
- ble L(base)
-
-ifdef(`OPERATION_lshift',`
- vdup.32 d6, r3 C left shift count is positive
- sub r3, r3, #64 C right shift count is negative
- vdup.32 d7, r3
- mov r12, #-8') C lshift pointer update offset
-ifdef(`OPERATION_rshift',`
- rsb r3, r3, #0 C right shift count is negative
- vdup.32 d6, r3
- add r3, r3, #64 C left shift count is positive
- vdup.32 d7, r3
- mov r12, #8') C rshift pointer update offset
-
-IFLSH(` sub ap, ap, #8 ')
- vld1.32 {d19}, [ap], r12 C load initial 2 limbs
- vshl.u64 d18, d19, d7 C retval
-
- tst rp, #4 C is rp 64-bit aligned already?
- beq L(rp_aligned) C yes, skip
-IFLSH(` add ap, ap, #4 ') C move back ap pointer
-IFRSH(` sub ap, ap, #4 ') C move back ap pointer
- vshl.u64 d4, d19, d6
- sub n, n, #1 C first limb handled
-IFLSH(` sub rp, rp, #4 ')
- vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
- vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
-
-L(rp_aligned):
-IFLSH(` sub rp, rp, #8 ')
- subs n, n, #6
- blt L(two_or_three_more)
- tst n, #2
- beq L(2)
-
-L(1): vld1.32 {d17}, [ap], r12
- vshl.u64 d5, d19, d6
- vld1.32 {d16}, [ap], r12
- vshl.u64 d0, d17, d7
- vshl.u64 d4, d17, d6
- sub n, n, #2
- b L(mid)
-
-L(2): vld1.32 {d16}, [ap], r12
- vshl.u64 d4, d19, d6
- vld1.32 {d17}, [ap], r12
- vshl.u64 d1, d16, d7
- vshl.u64 d5, d16, d6
- subs n, n, #4
- blt L(end)
-
-L(top): vld1.32 {d16}, [ap], r12
- vorr d2, d4, d1
- vshl.u64 d0, d17, d7
- vshl.u64 d4, d17, d6
- vst1.32 {d2}, [rp:64], r12
-L(mid): vld1.32 {d17}, [ap], r12
- vorr d3, d5, d0
- vshl.u64 d1, d16, d7
- vshl.u64 d5, d16, d6
- vst1.32 {d3}, [rp:64], r12
- subs n, n, #4
- bge L(top)
-
-L(end): tst n, #1
- beq L(evn)
-
- vorr d2, d4, d1
- vst1.32 {d2}, [rp:64], r12
- b L(cj1)
-
-L(evn): vorr d2, d4, d1
- vshl.u64 d0, d17, d7
- vshl.u64 d16, d17, d6
- vst1.32 {d2}, [rp:64], r12
- vorr d2, d5, d0
- b L(cj2)
-
-C Load last 2 - 3 limbs, store last 4 - 5 limbs
-L(two_or_three_more):
- tst n, #1
- beq L(l2)
-
-L(l3): vshl.u64 d5, d19, d6
- vld1.32 {d17}, [ap], r12
-L(cj1): veor d16, d16, d16
-IFLSH(` add ap, ap, #4 ')
- vld1.32 {d16[Y]}, [ap], r12
- vshl.u64 d0, d17, d7
- vshl.u64 d4, d17, d6
- vorr d3, d5, d0
- vshl.u64 d1, d16, d7
- vshl.u64 d5, d16, d6
- vst1.32 {d3}, [rp:64], r12
- vorr d2, d4, d1
- vst1.32 {d2}, [rp:64], r12
-IFLSH(` add rp, rp, #4 ')
- vst1.32 {d5[Y]}, [rp]
- vmov.32 r0, d18[X]
- bx lr
-
-L(l2): vld1.32 {d16}, [ap], r12
- vshl.u64 d4, d19, d6
- vshl.u64 d1, d16, d7
- vshl.u64 d16, d16, d6
- vorr d2, d4, d1
-L(cj2): vst1.32 {d2}, [rp:64], r12
- vst1.32 {d16}, [rp]
- vmov.32 r0, d18[X]
- bx lr
-
-
-define(`tnc', `r12')
-L(base):
- push {r4, r6, r7, r8}
-ifdef(`OPERATION_lshift',`
- ldr r4, [ap, #-4]!
- rsb tnc, cnt, #32
-
- mov r7, r4, lsl cnt
- tst n, #1
- beq L(ev) C n even
-
-L(od): subs n, n, #2
- bcc L(ed1) C n = 1
- ldr r8, [ap, #-4]!
- b L(md) C n = 3
-
-L(ev): ldr r6, [ap, #-4]!
- subs n, n, #2
- beq L(ed) C n = 3
- C n = 4
-L(tp): ldr r8, [ap, #-4]!
- orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r6, lsl cnt
-L(md): ldr r6, [ap, #-4]!
- orr r7, r7, r8, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r8, lsl cnt
-
-L(ed): orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r6, lsl cnt
-L(ed1): str r7, [rp, #-4]
- mov r0, r4, lsr tnc
-')
-ifdef(`OPERATION_rshift',`
- ldr r4, [ap]
- rsb tnc, cnt, #32
-
- mov r7, r4, lsr cnt
- tst n, #1
- beq L(ev) C n even
-
-L(od): subs n, n, #2
- bcc L(ed1) C n = 1
- ldr r8, [ap, #4]!
- b L(md) C n = 3
-
-L(ev): ldr r6, [ap, #4]!
- subs n, n, #2
- beq L(ed) C n = 2
- C n = 4
-
-L(tp): ldr r8, [ap, #4]!
- orr r7, r7, r6, lsl tnc
- str r7, [rp], #4
- mov r7, r6, lsr cnt
-L(md): ldr r6, [ap, #4]!
- orr r7, r7, r8, lsl tnc
- str r7, [rp], #4
- mov r7, r8, lsr cnt
-
-L(ed): orr r7, r7, r6, lsl tnc
- str r7, [rp], #4
- mov r7, r6, lsr cnt
-L(ed1): str r7, [rp], #4
- mov r0, r4, lsl tnc
-')
- pop {r4, r6, r7, r8}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/neon/lshiftc.asm b/gmp/mpn/arm/neon/lshiftc.asm
deleted file mode 100644
index 9e4096256d..0000000000
--- a/gmp/mpn/arm/neon/lshiftc.asm
+++ /dev/null
@@ -1,257 +0,0 @@
-dnl ARM Neon mpn_lshiftc.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C StrongARM - -
-C XScale - -
-C Cortex-A7 ? ?
-C Cortex-A8 ? ?
-C Cortex-A9 3.5 3.5 Y
-C Cortex-A15 1.75 1.75 Y
-
-
-C We read 64 bits at a time at 32-bit aligned addresses, and except for the
-C first and last store, we write using 64-bit aligned addresses. All shifting
-C is done on 64-bit words in 'extension' registers.
-C
-C It should be possible to read also using 64-bit alignment, by manipulating
-C the shift count for unaligned operands. Not done, since it does not seem to
-C matter for A9 or A15.
-C
-C This will not work in big-endian mode.
-
-C TODO
-C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
-C which might make it tricky.
-C * Clean up and simplify.
-C * Consider sharing most of the code for lshift and rshift, since the feed-in
-C code, the loop, and most of the wind-down code are identical.
-C * Replace the basecase code with code using 'extension' registers.
-C * Optimise. It is not clear that this loop insn permutation is optimal for
-C either A9 or A15.
-
-C INPUT PARAMETERS
-define(`rp', `r0')
-define(`ap', `r1')
-define(`n', `r2')
-define(`cnt', `r3')
-
- define(`IFLSH', `$1')
- define(`IFRSH', `')
- define(`X',`0')
- define(`Y',`1')
- define(`func',`mpn_lshiftc')
-define(`OPERATION_lshiftc',1)
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_lshiftc)
-IFLSH(` mov r12, n, lsl #2 ')
-IFLSH(` add rp, rp, r12 ')
-IFLSH(` add ap, ap, r12 ')
-
- cmp n, #4 C SIMD code n limit
- ble L(base)
-
-ifdef(`OPERATION_lshiftc',`
- vdup.32 d6, r3 C left shift count is positive
- sub r3, r3, #64 C right shift count is negative
- vdup.32 d7, r3
- mov r12, #-8') C lshift pointer update offset
-ifdef(`OPERATION_rshift',`
- rsb r3, r3, #0 C right shift count is negative
- vdup.32 d6, r3
- add r3, r3, #64 C left shift count is positive
- vdup.32 d7, r3
- mov r12, #8') C rshift pointer update offset
-
-IFLSH(` sub ap, ap, #8 ')
- vld1.32 {d19}, [ap], r12 C load initial 2 limbs
- vshl.u64 d18, d19, d7 C retval
-
- tst rp, #4 C is rp 64-bit aligned already?
- beq L(rp_aligned) C yes, skip
- vmvn d19, d19
-IFLSH(` add ap, ap, #4 ') C move back ap pointer
-IFRSH(` sub ap, ap, #4 ') C move back ap pointer
- vshl.u64 d4, d19, d6
- sub n, n, #1 C first limb handled
-IFLSH(` sub rp, rp, #4 ')
- vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
- vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
-
-L(rp_aligned):
-IFLSH(` sub rp, rp, #8 ')
- subs n, n, #6
- vmvn d19, d19
- blt L(two_or_three_more)
- tst n, #2
- beq L(2)
-
-L(1): vld1.32 {d17}, [ap], r12
- vshl.u64 d5, d19, d6
- vmvn d17, d17
- vld1.32 {d16}, [ap], r12
- vshl.u64 d0, d17, d7
- vshl.u64 d4, d17, d6
- sub n, n, #2
- b L(mid)
-
-L(2): vld1.32 {d16}, [ap], r12
- vshl.u64 d4, d19, d6
- vmvn d16, d16
- vld1.32 {d17}, [ap], r12
- vshl.u64 d1, d16, d7
- vshl.u64 d5, d16, d6
- subs n, n, #4
- blt L(end)
-
-L(top): vmvn d17, d17
- vld1.32 {d16}, [ap], r12
- vorr d2, d4, d1
- vshl.u64 d0, d17, d7
- vshl.u64 d4, d17, d6
- vst1.32 {d2}, [rp:64], r12
-L(mid): vmvn d16, d16
- vld1.32 {d17}, [ap], r12
- vorr d3, d5, d0
- vshl.u64 d1, d16, d7
- vshl.u64 d5, d16, d6
- vst1.32 {d3}, [rp:64], r12
- subs n, n, #4
- bge L(top)
-
-L(end): tst n, #1
- beq L(evn)
-
- vorr d2, d4, d1
- vst1.32 {d2}, [rp:64], r12
- b L(cj1)
-
-L(evn): vmvn d17, d17
- vorr d2, d4, d1
- vshl.u64 d0, d17, d7
- vshl.u64 d4, d17, d6
- vst1.32 {d2}, [rp:64], r12
- vmvn.u8 d17, #0
- vorr d2, d5, d0
- vshl.u64 d0, d17, d7
- vorr d3, d4, d0
- b L(cj2)
-
-C Load last 2 - 3 limbs, store last 4 - 5 limbs
-L(two_or_three_more):
- tst n, #1
- beq L(l2)
-
-L(l3): vshl.u64 d5, d19, d6
- vld1.32 {d17}, [ap], r12
-L(cj1): vmov.u8 d16, #0
-IFLSH(` add ap, ap, #4 ')
- vmvn d17, d17
- vld1.32 {d16[Y]}, [ap], r12
- vshl.u64 d0, d17, d7
- vshl.u64 d4, d17, d6
- vmvn d16, d16
- vorr d3, d5, d0
- vshl.u64 d1, d16, d7
- vshl.u64 d5, d16, d6
- vst1.32 {d3}, [rp:64], r12
- vorr d2, d4, d1
- vst1.32 {d2}, [rp:64], r12
-IFLSH(` add rp, rp, #4 ')
- vst1.32 {d5[Y]}, [rp]
- vmov.32 r0, d18[X]
- bx lr
-
-L(l2): vld1.32 {d16}, [ap], r12
- vshl.u64 d4, d19, d6
- vmvn d16, d16
- vshl.u64 d1, d16, d7
- vshl.u64 d5, d16, d6
- vmvn.u8 d17, #0
- vorr d2, d4, d1
- vshl.u64 d0, d17, d7
- vorr d3, d5, d0
-L(cj2): vst1.32 {d2}, [rp:64], r12
- vst1.32 {d3}, [rp]
- vmov.32 r0, d18[X]
- bx lr
-
-
-define(`tnc', `r12')
-L(base):
- push {r4, r6, r7, r8}
- ldr r4, [ap, #-4]!
- rsb tnc, cnt, #32
- mvn r6, r4
-
- mov r7, r6, lsl cnt
- tst n, #1
- beq L(ev) C n even
-
-L(od): subs n, n, #2
- bcc L(ed1) C n = 1
- ldr r8, [ap, #-4]!
- mvn r8, r8
- b L(md) C n = 3
-
-L(ev): ldr r6, [ap, #-4]!
- mvn r6, r6
- subs n, n, #2
- beq L(ed) C n = 3
- C n = 4
-L(tp): ldr r8, [ap, #-4]!
- orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mvn r8, r8
- mov r7, r6, lsl cnt
-L(md): ldr r6, [ap, #-4]!
- orr r7, r7, r8, lsr tnc
- str r7, [rp, #-4]!
- mvn r6, r6
- mov r7, r8, lsl cnt
-
-L(ed): orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]!
- mov r7, r6, lsl cnt
-L(ed1): mvn r6, #0
- orr r7, r7, r6, lsr tnc
- str r7, [rp, #-4]
- mov r0, r4, lsr tnc
- pop {r4, r6, r7, r8}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/neon/popcount.asm b/gmp/mpn/arm/neon/popcount.asm
deleted file mode 100644
index 2f8f9afc8d..0000000000
--- a/gmp/mpn/arm/neon/popcount.asm
+++ /dev/null
@@ -1,166 +0,0 @@
-dnl ARM Neon mpn_popcount -- mpn bit population count.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.125
-C Cortex-A15 0.56
-
-C TODO
-C * Explore using vldr and vldm. Does it help on A9? (These loads do
-C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for
-C popcount. Except perhaps also for popcount for the edge loads.)
-C * Arrange to align the pointer, if that helps performance. Use the same
-C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
-C valgrind!)
-C * Explore if explicit align directives, e.g., "[ptr:128]" help.
-C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
-
-C INPUT PARAMETERS
-define(`ap', r0)
-define(`n', r1)
-
-C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
-C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or
-C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which
-C can be represented as a 8-bit ARM constant.
-C
-define(`chunksize',0x3f80)
-
-ASM_START()
-PROLOGUE(mpn_popcount)
-
- cmp n, #chunksize
- bhi L(gt16k)
-
-L(lt16k):
- vmov.i64 q8, #0 C clear summation register
- vmov.i64 q9, #0 C clear summation register
-
- tst n, #1
- beq L(xxx0)
- vmov.i64 d0, #0
- sub n, n, #1
- vld1.32 {d0[0]}, [ap]! C load 1 limb
- vcnt.8 d24, d0
- vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
-
-L(xxx0):tst n, #2
- beq L(xx00)
- sub n, n, #2
- vld1.32 {d0}, [ap]! C load 2 limbs
- vcnt.8 d24, d0
- vpadal.u8 d16, d24
-
-L(xx00):tst n, #4
- beq L(x000)
- sub n, n, #4
- vld1.32 {q0}, [ap]! C load 4 limbs
- vcnt.8 q12, q0
- vpadal.u8 q8, q12
-
-L(x000):tst n, #8
- beq L(0000)
-
- subs n, n, #8
- vld1.32 {q0,q1}, [ap]! C load 8 limbs
- bls L(sum)
-
-L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
- sub n, n, #8
- vcnt.8 q12, q0
- vcnt.8 q13, q1
- b L(mid)
-
-L(0000):subs n, n, #16
- blo L(e0)
-
- vld1.32 {q2,q3}, [ap]! C load 8 limbs
- vld1.32 {q0,q1}, [ap]! C load 8 limbs
- vcnt.8 q12, q2
- vcnt.8 q13, q3
- subs n, n, #16
- blo L(end)
-
-L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
- vpadal.u8 q8, q12
- vcnt.8 q12, q0
- vpadal.u8 q9, q13
- vcnt.8 q13, q1
-L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
- subs n, n, #16
- vpadal.u8 q8, q12
- vcnt.8 q12, q2
- vpadal.u8 q9, q13
- vcnt.8 q13, q3
- bhs L(top)
-
-L(end): vpadal.u8 q8, q12
- vpadal.u8 q9, q13
-L(sum): vcnt.8 q12, q0
- vcnt.8 q13, q1
- vpadal.u8 q8, q12
- vpadal.u8 q9, q13
- vadd.i16 q8, q8, q9
- C we have 8 16-bit counts
-L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
- vpaddl.u32 q8, q8 C we have 2 64-bit counts
- vmov.32 r0, d16[0]
- vmov.32 r1, d17[0]
- add r0, r0, r1
- bx lr
-
-C Code for large count. Splits operand and calls above code.
-define(`ap2', r2) C caller-saves reg not used above
-L(gt16k):
- push {r4,r14}
- mov ap2, ap
- mov r3, n C full count
- mov r4, #0 C total sum
-
-1: mov n, #chunksize C count for this invocation
- bl L(lt16k) C could jump deep inside code
- add ap2, ap2, #chunksize*4 C point at next chunk
- add r4, r4, r0
- mov ap, ap2 C put chunk pointer in place for call
- sub r3, r3, #chunksize
- cmp r3, #chunksize
- bhi 1b
-
- mov n, r3 C count for final invocation
- bl L(lt16k)
- add r0, r4, r0
- pop {r4,pc}
-EPILOGUE()
diff --git a/gmp/mpn/arm/neon/sec_tabselect.asm b/gmp/mpn/arm/neon/sec_tabselect.asm
deleted file mode 100644
index 69fceb0063..0000000000
--- a/gmp/mpn/arm/neon/sec_tabselect.asm
+++ /dev/null
@@ -1,140 +0,0 @@
-dnl ARM Neon mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.15
-C Cortex-A15 0.65
-
-define(`rp', `r0')
-define(`tp', `r1')
-define(`n', `r2')
-define(`nents', `r3')
-C define(`which', on stack)
-
-define(`i', `r4')
-define(`j', `r5')
-
-define(`maskq', `q10')
-define(`maskd', `d20')
-
-ASM_START()
-PROLOGUE(mpn_sec_tabselect)
- push {r4-r5}
-
- add r4, sp, #8
- vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies
- vmov.i32 q14, #1 C 4 copies of 1
-
- subs j, n, #8
- bmi L(outer_end)
-
-L(outer_top):
- mov i, nents
- mov r12, tp C preserve tp
- veor q13, q13, q13 C 4 counter copies
- veor q2, q2, q2
- veor q3, q3, q3
- ALIGN(16)
-L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies
- vld1.32 {q0,q1}, [tp]
- vadd.i32 q13, q13, q14
- vbit q2, q0, maskq
- vbit q3, q1, maskq
- add tp, tp, n, lsl #2
- subs i, i, #1
- bne L(top)
- vst1.32 {q2,q3}, [rp]!
- add tp, r12, #32 C restore tp, point to next slice
- subs j, j, #8
- bpl L(outer_top)
-L(outer_end):
-
- tst n, #4
- beq L(b0xx)
-L(b1xx):mov i, nents
- mov r12, tp
- veor q13, q13, q13
- veor q2, q2, q2
- ALIGN(16)
-L(tp4): vceq.i32 maskq, q13, q15
- vld1.32 {q0}, [tp]
- vadd.i32 q13, q13, q14
- vbit q2, q0, maskq
- add tp, tp, n, lsl #2
- subs i, i, #1
- bne L(tp4)
- vst1.32 {q2}, [rp]!
- add tp, r12, #16
-
-L(b0xx):tst n, #2
- beq L(b00x)
-L(b01x):mov i, nents
- mov r12, tp
- veor d26, d26, d26
- veor d4, d4, d4
- ALIGN(16)
-L(tp2): vceq.i32 maskd, d26, d30
- vld1.32 {d0}, [tp]
- vadd.i32 d26, d26, d28
- vbit d4, d0, maskd
- add tp, tp, n, lsl #2
- subs i, i, #1
- bne L(tp2)
- vst1.32 {d4}, [rp]!
- add tp, r12, #8
-
-L(b00x):tst n, #1
- beq L(b000)
-L(b001):mov i, nents
- mov r12, tp
- veor d26, d26, d26
- veor d4, d4, d4
- ALIGN(16)
-L(tp1): vceq.i32 maskd, d26, d30
- vld1.32 {d0[0]}, [tp]
- vadd.i32 d26, d26, d28
- vbit d4, d0, maskd
- add tp, tp, n, lsl #2
- subs i, i, #1
- bne L(tp1)
- vst1.32 {d4[0]}, [rp]
-
-L(b000):pop {r4-r5}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/rsh1aors_n.asm b/gmp/mpn/arm/rsh1aors_n.asm
deleted file mode 100644
index 95c1f79ad9..0000000000
--- a/gmp/mpn/arm/rsh1aors_n.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-dnl ARM mpn_rsh1add_n and mpn_rsh1sub_n.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.64-3.7
-C Cortex-A15 2.5
-
-C TODO
-C * Not optimised.
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(`ADDSUB', adds)
- define(`ADDSUBC', adcs)
- define(`RSTCY', `cmn $1, $1')
- define(`func', mpn_rsh1add_n)
- define(`func_nc', mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(`ADDSUB', subs)
- define(`ADDSUBC', sbcs)
- define(`RSTCY',
- `mvn $2, #0x80000000
- cmp $2, $1')
- define(`func', mpn_rsh1sub_n)
- define(`func_nc', mpn_rsh1sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
-
-ASM_START()
-PROLOGUE(func)
- push {r4-r11}
- ldr r4, [up], #4
- ldr r8, [vp], #4
- ADDSUB r4, r4, r8
- movs r12, r7, rrx
- and r11, r4, #1 C return value
- subs n, n, #4
- blo L(end)
-
-L(top): ldmia up!, {r5,r6,r7}
- ldmia vp!, {r8,r9,r10}
- cmn r12, r12
- ADDSUBC r5, r5, r8
- ADDSUBC r6, r6, r9
- ADDSUBC r7, r7, r10
- movs r12, r7, rrx
- movs r6, r6, rrx
- movs r5, r5, rrx
- movs r4, r4, rrx
- subs n, n, #3
- stmia rp!, {r4,r5,r6}
- mov r4, r7
- bhs L(top)
-
-L(end): cmn n, #2
- bls L(e2)
- ldm up, {r5,r6}
- ldm vp, {r8,r9}
- cmn r12, r12
- ADDSUBC r5, r5, r8
- ADDSUBC r6, r6, r9
- movs r12, r6, rrx
- movs r5, r5, rrx
- movs r4, r4, rrx
- stmia rp!, {r4,r5}
- mov r4, r6
- b L(e1)
-
-L(e2): bne L(e1)
- ldr r5, [up, #0]
- ldr r8, [vp, #0]
- cmn r12, r12
- ADDSUBC r5, r5, r8
- movs r12, r5, rrx
- movs r4, r4, rrx
- str r4, [rp], #4
- mov r4, r5
-
-L(e1): RSTCY( r12, r1)
- mov r4, r4, rrx
- str r4, [rp, #0]
- mov r0, r11
- pop {r4-r11}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/rshift.asm b/gmp/mpn/arm/rshift.asm
deleted file mode 100644
index 84728d038a..0000000000
--- a/gmp/mpn/arm/rshift.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-dnl ARM mpn_rshift.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.5
-C Cortex-A15 ?
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`cnt', `r3')
-define(`tnc', `r12')
-
-ASM_START()
-PROLOGUE(mpn_rshift)
- push {r4, r6, r7, r8}
- ldr r4, [up]
- rsb tnc, cnt, #32
-
- mov r7, r4, lsr cnt
- tst n, #1
- beq L(evn) C n even
-
-L(odd): subs n, n, #2
- bcc L(1) C n = 1
- ldr r8, [up, #4]!
- b L(mid)
-
-L(evn): ldr r6, [up, #4]!
- subs n, n, #2
- beq L(end)
-
-L(top): ldr r8, [up, #4]!
- orr r7, r7, r6, lsl tnc
- str r7, [rp], #4
- mov r7, r6, lsr cnt
-L(mid): ldr r6, [up, #4]!
- orr r7, r7, r8, lsl tnc
- str r7, [rp], #4
- mov r7, r8, lsr cnt
- subs n, n, #2
- bgt L(top)
-
-L(end): orr r7, r7, r6, lsl tnc
- str r7, [rp], #4
- mov r7, r6, lsr cnt
-L(1): str r7, [rp]
- mov r0, r4, lsl tnc
- pop {r4, r6, r7, r8}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/sec_tabselect.asm b/gmp/mpn/arm/sec_tabselect.asm
deleted file mode 100644
index 8cf937a091..0000000000
--- a/gmp/mpn/arm/sec_tabselect.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-dnl ARM mpn_sec_tabselect
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 2.33
-C Cortex-A15 2.2
-
-C TODO
-C * Consider using special code for small nents, either swapping the inner and
-C outer loops, or providing a few completely unrolling the inner loops.
-
-define(`rp', `r0')
-define(`tp', `r1')
-define(`n', `r2')
-define(`nents', `r3')
-C which on stack
-
-define(`i', `r11')
-define(`j', `r12')
-define(`c', `r14')
-define(`mask', `r7')
-
-ASM_START()
-PROLOGUE(mpn_sec_tabselect)
- push {r4-r11, r14}
-
- subs j, n, #3
- bmi L(outer_end)
-L(outer_top):
- ldr c, [sp, #36]
- mov i, nents
- push {tp}
-
- mov r8, #0
- mov r9, #0
- mov r10, #0
-
-L(top): subs c, c, #1
- ldm tp, {r4,r5,r6}
- sbc mask, mask, mask
- subs i, i, #1
- add tp, tp, n, lsl #2
- and r4, r4, mask
- and r5, r5, mask
- and r6, r6, mask
- orr r8, r8, r4
- orr r9, r9, r5
- orr r10, r10, r6
- bge L(top)
-
- stmia rp!, {r8,r9,r10}
- pop {tp}
- add tp, tp, #12
- subs j, j, #3
- bpl L(outer_top)
-L(outer_end):
-
- cmp j, #-1
- bne L(n2)
-
- ldr c, [sp, #36]
- mov i, nents
- mov r8, #0
- mov r9, #0
-L(tp2): subs c, c, #1
- sbc mask, mask, mask
- ldm tp, {r4,r5}
- subs i, i, #1
- add tp, tp, n, lsl #2
- and r4, r4, mask
- and r5, r5, mask
- orr r8, r8, r4
- orr r9, r9, r5
- bge L(tp2)
- stmia rp, {r8,r9}
- pop {r4-r11, r14}
- bx lr
-
-L(n2): cmp j, #-2
- bne L(n1)
-
- ldr c, [sp, #36]
- mov i, nents
- mov r8, #0
-L(tp1): subs c, c, #1
- sbc mask, mask, mask
- ldr r4, [tp]
- subs i, i, #1
- add tp, tp, n, lsl #2
- and r4, r4, mask
- orr r8, r8, r4
- bge L(tp1)
- str r8, [rp]
-L(n1): pop {r4-r11, r14}
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/sub_n.asm b/gmp/mpn/arm/sub_n.asm
new file mode 100644
index 0000000000..7063be4f13
--- /dev/null
+++ b/gmp/mpn/arm/sub_n.asm
@@ -0,0 +1,71 @@
+dnl ARM mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+dnl Contributed by Robert Harley.
+
+dnl Copyright 1997, 2000, 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This code runs at 5 cycles/limb.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`vp',`r2')
+define(`n',`r3')
+
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ stmfd sp!, { r8, r9, lr } C save r8, r9 and the return address
+ subs r12, r12, r12 C set carry, i.e. no borrow pending
+ tst n, #1 C odd limb count?
+ beq L(skip1)
+ ldr r12, [up], #4 C peel off one limb
+ ldr lr, [vp], #4
+ subs r12, r12, lr C first limb, so no incoming borrow
+ str r12, [rp], #4
+L(skip1):
+ tst n, #2 C tst with this immediate leaves carry intact
+ beq L(skip2)
+ ldmia up!, { r8, r9 } C peel off two limbs
+ ldmia vp!, { r12, lr }
+ sbcs r8, r8, r12 C subtract with borrow held in carry
+ sbcs r9, r9, lr
+ stmia rp!, { r8, r9 }
+L(skip2):
+ bics n, n, #3 C round limb count down to a multiple of 4
+ beq L(return) C nothing left for the 4-limb loop
+ stmfd sp!, { r4, r5, r6, r7 } C loop needs four more registers
+L(sub_n_loop):
+ ldmia up!, { r4, r5, r6, r7 } C four limbs per iteration
+ ldmia vp!, { r8, r9, r12, lr }
+ sbcs r4, r4, r8
+ ldr r8, [rp, #12] C cache allocate
+ sbcs r5, r5, r9
+ sbcs r6, r6, r12
+ sbcs r7, r7, lr
+ stmia rp!, { r4, r5, r6, r7 }
+ sub n, n, #4 C non-flag-setting sub keeps the borrow in carry
+ teq n, #0 C teq with immediate leaves carry intact
+ bne L(sub_n_loop)
+ ldmfd sp!, { r4, r5, r6, r7 }
+L(return):
+ sbc r0, r0, r0 C r0 = 0 if carry set, -1 if borrow out
+ and r0, r0, #1 C return the borrow as 0 or 1
+ ldmfd sp!, { r8, r9, pc } C restore r8, r9 and return
+EPILOGUE(mpn_sub_n)
diff --git a/gmp/mpn/arm/submul_1.asm b/gmp/mpn/arm/submul_1.asm
new file mode 100644
index 0000000000..c3654377d7
--- /dev/null
+++ b/gmp/mpn/arm/submul_1.asm
@@ -0,0 +1,107 @@
+dnl ARM mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl result from a second limb vector.
+
+dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: 7.75-9.75 (dependent on vl value)
+C XScale: 8-9 (dependent on vl value, estimated)
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n',`r2')
+define(`vl',`r3')
+define(`rl',`r12')
+define(`ul',`r6')
+define(`r',`lr')
+
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ stmfd sp!, { r4-r6, lr } C save r4-r6 and the return address
+ subs r4, r0, r0 C clear r4, set cy
+ tst n, #1 C odd limb count?
+ beq L(skip1)
+ ldr ul, [up], #4 C peel off one limb
+ ldr rl, [rp, #0]
+ umull r5, r4, ul, vl C r5 = low, r4 = high limb of product
+ subs r, rl, r5 C first limb, so no incoming borrow
+ str r, [rp], #4
+L(skip1):
+ tst n, #2 C tst with this immediate leaves carry intact
+ beq L(skip2)
+ ldr ul, [up], #4 C peel off two limbs
+ ldr rl, [rp, #0]
+ mov r5, #0
+ umlal r4, r5, ul, vl C r4 = low incl. previous high limb, r5 = high
+ ldr ul, [up], #4
+ sbcs r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl C roles swap: r5 = low, r4 = high
+ str r, [rp], #4
+ sbcs r, rl, r5
+ str r, [rp], #4
+L(skip2):
+ bics r, n, #3 C any whole groups of 4 limbs left?
+ beq L(return)
+
+ ldr ul, [up], #4 C prime the pipeline with the first product
+ ldr rl, [rp, #0]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ b L(in) C enter the loop in the middle
+
+L(loop):
+ ldr ul, [up], #4
+ sbcs r, rl, r5 C finish the limb pending from previous pass
+ ldr rl, [rp, #4]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ str r, [rp], #4
+L(in): ldr ul, [up], #4
+ sbcs r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl
+ str r, [rp], #4
+ ldr ul, [up], #4
+ sbcs r, rl, r5
+ ldr rl, [rp, #4]
+ mov r5, #0
+ umlal r4, r5, ul, vl
+ str r, [rp], #4
+ ldr ul, [up], #4
+ sbcs r, rl, r4
+ ldr rl, [rp, #4]
+ mov r4, #0
+ umlal r5, r4, ul, vl
+ str r, [rp], #4
+ sub n, n, #4 C non-flag-setting sub keeps the borrow in carry
+ bics r, n, #3 C more groups of 4 limbs? carry left intact
+ bne L(loop)
+
+ sbcs r, rl, r5 C drain the last pending limb
+ str r, [rp], #4
+L(return):
+ sbc r0, r0, r0 C r0 = 0 if carry set, -1 if borrow out
+ sub r0, r4, r0 C return high product limb plus borrow
+ ldmfd sp!, { r4-r6, pc } C restore r4-r6 and return
+EPILOGUE(mpn_submul_1)
diff --git a/gmp/mpn/arm/udiv.asm b/gmp/mpn/arm/udiv.asm
index 8d441c74ed..9434a4f2b6 100644
--- a/gmp/mpn/arm/udiv.asm
+++ b/gmp/mpn/arm/udiv.asm
@@ -1,33 +1,22 @@
dnl ARM mpn_udiv_qrnnd -- divide a two limb dividend and a one limb divisor.
dnl Return quotient and store remainder through a supplied pointer.
-dnl Copyright 2001, 2012 Free Software Foundation, Inc.
+dnl Copyright 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -59,9 +48,9 @@ L(oop): divstep(n1,n0,d)
teq r12, #0
bne L(oop)
- str n1, [rem_ptr] C store remainder
+ str n1, [ rem_ptr ] C store remainder
adc r0, n0, n0 C quotient: add last carry from divstep
- bx lr
+ mov pc, lr
L(_large_divisor):
stmfd sp!, { r8, lr }
@@ -98,7 +87,7 @@ L(oop2):
addcs n0, n0, #1 C adjust quotient
L(_even_divisor):
- str n1, [rem_ptr] C store remainder
+ str n1, [ rem_ptr ] C store remainder
mov r0, n0 C quotient
ldmfd sp!, { r8, pc }
EPILOGUE(mpn_udiv_qrnnd)
diff --git a/gmp/mpn/arm/v5/gcd_1.asm b/gmp/mpn/arm/v5/gcd_1.asm
deleted file mode 100644
index 169d154bf0..0000000000
--- a/gmp/mpn/arm/v5/gcd_1.asm
+++ /dev/null
@@ -1,120 +0,0 @@
-dnl ARM v5 mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C StrongARM -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 5.9
-C Cortex-A15 ?
-C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop better.
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up', `r0')
-define(`n', `r1')
-define(`v0', `r2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
- `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- push {r4, r7, lr}
- ldr r3, [up] C U low limb
-
- orr r3, r3, v0
- rsb r4, r3, #0
- and r4, r4, r3
- clz r4, r4 C min(ctz(u0),ctz(v0))
- rsb r4, r4, #31
-
- rsb r12, v0, #0
- and r12, r12, v0
- clz r12, r12
- rsb r12, r12, #31
- mov v0, v0, lsr r12
-
- mov r7, v0
-
- cmp n, #1
- bne L(nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- ldr r3, [up]
- cmp v0, r3, lsr #BMOD_THRES_LOG2
- bhi L(red1)
-
-L(bmod):mov r3, #0 C carry argument
- bl mpn_modexact_1c_odd
- b L(red0)
-
-L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
- blo L(bmod)
-
- bl mpn_mod_1
-
-L(red0):mov r3, r0
-L(red1):rsbs r12, r3, #0
- and r12, r12, r3
- clz r12, r12
- rsb r12, r12, #31
- bne L(mid)
- b L(end)
-
- ALIGN(8)
-L(top): rsb r12, r12, #31
- movcc r3, r1 C if x-y < 0
- movcc r7, r0 C use x,y-x
-L(mid): mov r3, r3, lsr r12 C
- mov r0, r3 C
- sub r1, r7, r3 C
- rsbs r3, r7, r3 C
- and r12, r1, r3 C
- clz r12, r12 C
- bne L(top) C
-
-L(end): mov r0, r7, lsl r4
- pop {r4, r7, pc}
-EPILOGUE()
diff --git a/gmp/mpn/arm/v5/mod_1_1.asm b/gmp/mpn/arm/v5/mod_1_1.asm
deleted file mode 100644
index 3cf0cd7763..0000000000
--- a/gmp/mpn/arm/v5/mod_1_1.asm
+++ /dev/null
@@ -1,129 +0,0 @@
-dnl ARM mpn_mod_1_1p
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 7
-C Cortex-A15 6
-
-define(`ap', `r0')
-define(`n', `r1')
-define(`d', `r2')
-define(`cps',`r3')
-
-ASM_START()
-PROLOGUE(mpn_mod_1_1p)
- push {r4-r10}
- add r0, r0, r1, asl #2
- ldr r5, [r0, #-4]!
- ldr r12, [r0, #-4]!
- subs r1, r1, #2
- ble L(4)
- ldr r8, [r3, #12]
- mov r4, r12
- mov r10, r5
- umull r7, r5, r10, r8
- sub r1, r1, #1
- b L(mid)
-
-L(top): adds r12, r6, r7
- adcs r10, r4, r5
- sub r1, r1, #1
- mov r6, #0
- movcs r6, r8
- umull r7, r5, r10, r8
- adds r4, r12, r6
- subcs r4, r4, r2
-L(mid): ldr r6, [r0, #-4]!
- teq r1, #0
- bne L(top)
-
- adds r12, r6, r7
- adcs r5, r4, r5
- subcs r5, r5, r2
-L(4): ldr r1, [r3, #4]
- cmp r1, #0
- beq L(7)
- ldr r4, [r3, #8]
- umull r0, r6, r5, r4
- adds r12, r0, r12
- addcs r6, r6, #1
- rsb r0, r1, #32
- mov r0, r12, lsr r0
- orr r5, r0, r6, asl r1
- mov r12, r12, asl r1
- b L(8)
-L(7): cmp r5, r2
- subcs r5, r5, r2
-L(8): ldr r0, [r3, #0]
- umull r4, r3, r5, r0
- add r5, r5, #1
- adds r0, r4, r12
- adc r5, r3, r5
- mul r5, r2, r5
- sub r12, r12, r5
- cmp r12, r0
- addhi r12, r12, r2
- cmp r2, r12
- subls r12, r12, r2
- mov r0, r12, lsr r1
- pop {r4-r10}
- bx r14
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1_1p_cps)
- stmfd sp!, {r4, r5, r6, r14}
- mov r5, r0
- clz r4, r1
- mov r0, r1, asl r4
- rsb r6, r0, #0
- bl mpn_invert_limb
- str r0, [r5, #0]
- str r4, [r5, #4]
- cmp r4, #0
- beq L(2)
- rsb r1, r4, #32
- mov r3, #1
- mov r3, r3, asl r4
- orr r3, r3, r0, lsr r1
- mul r3, r6, r3
- mov r4, r3, lsr r4
- str r4, [r5, #8]
-L(2): mul r0, r6, r0
- str r0, [r5, #12]
- ldmfd sp!, {r4, r5, r6, pc}
-EPILOGUE()
diff --git a/gmp/mpn/arm/v5/mod_1_2.asm b/gmp/mpn/arm/v5/mod_1_2.asm
deleted file mode 100644
index aa26ecb21c..0000000000
--- a/gmp/mpn/arm/v5/mod_1_2.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-dnl ARM mpn_mod_1s_2p
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 4.25
-C Cortex-A15 3
-
-define(`ap', `r0')
-define(`n', `r1')
-define(`d', `r2')
-define(`cps',`r3')
-
-ASM_START()
-PROLOGUE(mpn_mod_1s_2p)
- push {r4-r10}
- tst n, #1
- add r7, r3, #8
- ldmia r7, {r7, r8, r12} C load B1, B2, B3
- add ap, ap, n, lsl #2 C put ap at operand end
- beq L(evn)
-
-L(odd): subs n, n, #1
- beq L(1)
- ldmdb ap!, {r4,r6,r9}
- mov r10, #0
- umlal r4, r10, r6, r7
- umlal r4, r10, r9, r8
- b L(com)
-
-L(evn): ldmdb ap!, {r4,r10}
-L(com): subs n, n, #2
- ble L(end)
- ldmdb ap!, {r5,r6}
- b L(mid)
-
-L(top): mov r9, #0
- umlal r5, r9, r6, r7 C B1
- umlal r5, r9, r4, r8 C B2
- ldmdb ap!, {r4,r6}
- umlal r5, r9, r10, r12 C B3
- ble L(xit)
- mov r10, #0
- umlal r4, r10, r6, r7 C B1
- umlal r4, r10, r5, r8 C B2
- ldmdb ap!, {r5,r6}
- umlal r4, r10, r9, r12 C B3
-L(mid): subs n, n, #4
- bge L(top)
-
- mov r9, #0
- umlal r5, r9, r6, r7 C B1
- umlal r5, r9, r4, r8 C B2
- umlal r5, r9, r10, r12 C B3
- mov r4, r5
-
-L(end): movge r9, r10 C executed iff coming via xit
- ldr r6, [r3, #4] C cps[1] = cnt
- mov r5, #0
- umlal r4, r5, r9, r7
- mov r7, r5, lsl r6
-L(x): rsb r1, r6, #32
- orr r8, r7, r4, lsr r1
- mov r9, r4, lsl r6
- ldr r5, [r3, #0]
- add r0, r8, #1
- umull r12, r1, r8, r5
- adds r4, r12, r9
- adc r1, r1, r0
- mul r5, r2, r1
- sub r9, r9, r5
- cmp r9, r4
- addhi r9, r9, r2
- cmp r2, r9
- subls r9, r9, r2
- mov r0, r9, lsr r6
- pop {r4-r10}
- bx r14
-
-L(xit): mov r10, #0
- umlal r4, r10, r6, r7 C B1
- umlal r4, r10, r5, r8 C B2
- umlal r4, r10, r9, r12 C B3
- b L(end)
-
-L(1): ldr r6, [r3, #4] C cps[1] = cnt
- ldr r4, [ap, #-4] C ap[0]
- mov r7, #0
- b L(x)
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1s_2p_cps)
- push {r4-r8, r14}
- clz r4, r1
- mov r5, r1, lsl r4 C b <<= cnt
- mov r6, r0 C r6 = cps
- mov r0, r5
- bl mpn_invert_limb
- rsb r3, r4, #32
- mov r3, r0, lsr r3
- mov r2, #1
- orr r3, r3, r2, lsl r4
- rsb r1, r5, #0
- mul r2, r1, r3
- umull r3, r12, r2, r0
- add r12, r2, r12
- mvn r12, r12
- mul r1, r5, r12
- cmp r1, r3
- addhi r1, r1, r5
- umull r12, r7, r1, r0
- add r7, r1, r7
- mvn r7, r7
- mul r3, r5, r7
- cmp r3, r12
- addhi r3, r3, r5
- mov r5, r2, lsr r4
- mov r7, r1, lsr r4
- mov r8, r3, lsr r4
- stmia r6, {r0,r4,r5,r7,r8} C fill cps
- pop {r4-r8, pc}
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_1.asm b/gmp/mpn/arm/v6/addmul_1.asm
deleted file mode 100644
index 57019e4b2b..0000000000
--- a/gmp/mpn/arm/v6/addmul_1.asm
+++ /dev/null
@@ -1,111 +0,0 @@
-dnl ARM mpn_addmul_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.25
-C Cortex-A15 4
-
-C TODO
-C * Micro-optimise feed-in code.
-C * Optimise for n=1,2 by delaying register saving.
-C * Try using ldm/stm.
-
-define(`rp',`r0')
-define(`up',`r1')
-define(`n', `r2')
-define(`v0',`r3')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- stmfd sp!, { r4, r5, r6, r7 }
-
- ands r6, n, #3
- mov r12, #0
- beq L(fi0)
- cmp r6, #2
- bcc L(fi1)
- beq L(fi2)
-
-L(fi3): ldr r4, [up], #4
- ldr r6, [rp, #0]
- ldr r5, [up], #4
- b L(lo3)
-
-L(fi0): ldr r5, [up], #4
- ldr r7, [rp], #4
- ldr r4, [up], #4
- b L(lo0)
-
-L(fi1): ldr r4, [up], #4
- ldr r6, [rp], #8
- subs n, n, #1
- beq L(1)
- ldr r5, [up], #4
- b L(lo1)
-
-L(fi2): ldr r5, [up], #4
- ldr r7, [rp], #12
- ldr r4, [up], #4
- b L(lo2)
-
- ALIGN(16)
-L(top): ldr r6, [rp, #-8]
- ldr r5, [up], #4
- str r7, [rp, #-12]
-L(lo1): umaal r6, r12, r4, v0
- ldr r7, [rp, #-4]
- ldr r4, [up], #4
- str r6, [rp, #-8]
-L(lo0): umaal r7, r12, r5, v0
- ldr r6, [rp, #0]
- ldr r5, [up], #4
- str r7, [rp, #-4]
-L(lo3): umaal r6, r12, r4, v0
- ldr r7, [rp, #4]
- ldr r4, [up], #4
- str r6, [rp], #16
-L(lo2): umaal r7, r12, r5, v0
- subs n, n, #4
- bhi L(top)
-
- ldr r6, [rp, #-8]
- str r7, [rp, #-12]
-L(1): umaal r6, r12, r4, v0
- str r6, [rp, #-8]
- mov r0, r12
- ldmfd sp!, { r4, r5, r6, r7 }
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_2.asm b/gmp/mpn/arm/v6/addmul_2.asm
deleted file mode 100644
index 69817ce340..0000000000
--- a/gmp/mpn/arm/v6/addmul_2.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-dnl ARM mpn_addmul_2.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 2.25
-C Cortex-A15 2.5
-
-C This is believed to be optimal for A15 for any unrolling, and optimal for A9
-C for 4-way unrolling. Using separate pointer update instructions is necessary
-C for optimal A9 speed.
-
-C TODO:
-C * Start the first multiply or multiplies directly at function entry.
-
-
-define(`rp',`r0')
-define(`up',`r1')
-define(`n', `r2')
-define(`vp',`r3')
-
-define(`v0',`r6')
-define(`v1',`r7')
-define(`u0',`r3')
-define(`u1',`r9')
-
-define(`cya',`r8')
-define(`cyb',`r12')
-
-
-ASM_START()
-PROLOGUE(mpn_addmul_2)
- push { r4, r5, r6, r7, r8, r9 }
-
- ldm vp, { v0, v1 }
- mov cya, #0
- mov cyb, #0
-
- tst n, #1
- beq L(evn)
-
-L(odd): ldr r5, [rp, #0]
- ldr u0, [up, #0]
- ldr r4, [rp, #4]
- tst n, #2
- beq L(fi1)
-L(fi3): sub up, up, #12
- sub rp, rp, #12
- b L(lo3)
-L(fi1): sub n, n, #1
- sub up, up, #4
- sub rp, rp, #4
- b L(lo1)
-
-L(evn): ldr r4, [rp, #0]
- ldr u1, [up, #0]
- ldr r5, [rp, #4]
- tst n, #2
- bne L(fi2)
-L(fi0): sub up, up, #8
- sub rp, rp, #8
- b L(lo0)
-L(fi2): subs n, n, #2
- bls L(end)
-
- ALIGN(16)
-L(top): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #0]
- ldr r4, [rp, #8]
- umaal r5, cyb, u1, v1
-L(lo1): ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #4]
- ldr r5, [rp, #12]
- umaal r4, cyb, u0, v1
-L(lo0): ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #8]
- ldr r4, [rp, #16]
- umaal r5, cyb, u1, v1
-L(lo3): ldr u1, [up, #16]
- umaal r5, cya, u0, v0
- str r5, [rp, #12]
- ldr r5, [rp, #20]
- add rp, rp, #16
- umaal r4, cyb, u0, v1
- add up, up, #16
- subs n, n, #4
- bhi L(top)
-
-L(end): umaal r4, cya, u1, v0
- ldr u0, [up, #4]
- umaal r5, cyb, u1, v1
- str r4, [rp, #0]
- umaal r5, cya, u0, v0
- umaal cya, cyb, u0, v1
- str r5, [rp, #4]
- str cya, [rp, #8]
- mov r0, cyb
-
- pop { r4, r5, r6, r7, r8, r9 }
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/addmul_3.asm b/gmp/mpn/arm/v6/addmul_3.asm
deleted file mode 100644
index 046543020f..0000000000
--- a/gmp/mpn/arm/v6/addmul_3.asm
+++ /dev/null
@@ -1,187 +0,0 @@
-dnl ARM mpn_addmul_3.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 2.125
-C Cortex-A15 2
-
-C TODO
-C * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
-C avoiding the current multiply.
-C * Start the first multiply or multiplies early.
-
-define(`rp',`r0')
-define(`up',`r1')
-define(`n', `r2')
-define(`vp',`r3')
-
-define(`v0',`r4') define(`v1',`r5') define(`v2',`r6')
-define(`u0',`r3') define(`u1',`r14')
-define(`w0',`r7') define(`w1',`r8') define(`w2',`r9')
-define(`cy0',`r10') define(`cy1',`r11') define(`cy2',`r12')
-
-
-ASM_START()
-PROLOGUE(mpn_addmul_3)
- push { r4-r11, r14 }
-
- ldr w0, =0xaaaaaaab C 3^{-1} mod 2^32
- ldm vp, { v0,v1,v2 }
- mov cy0, #0
- mov cy1, #0
- mov cy2, #0
-
-C Tricky n mod 6
- mul w0, w0, n C n * 3^{-1} mod 2^32
- and w0, w0, #0xc0000001 C pseudo-CRT mod 3,2
- sub n, n, #3
-ifdef(`PIC',`
- add pc, pc, w0, ror $28
- nop
- b L(b0)
- b L(b2)
- b L(b4)
- .word 0xe7f000f0 C udf
- b L(b3)
- b L(b5)
- b L(b1)
-',`
- ldr pc, [pc, w0, ror $28]
- nop
- .word L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
-')
-
-L(b5): add up, up, #-8
- ldr w1, [rp, #0]
- ldr w2, [rp, #4]
- ldr u1, [up, #8]
- b L(lo5)
-
-L(b4): add rp, rp, #-4
- add up, up, #-12
- ldr w2, [rp, #4]
- ldr w0, [rp, #8]
- ldr u0, [up, #12]
- b L(lo4)
-
-L(b3): add rp, rp, #-8
- add up, up, #-16
- ldr w0, [rp, #8]
- ldr w1, [rp, #12]
- ldr u1, [up, #16]
- b L(lo3)
-
-L(b1): add rp, rp, #8
- ldr w2, [rp, #-8]
- ldr w0, [rp, #-4]
- ldr u1, [up, #0]
- b L(lo1)
-
-L(b0): add rp, rp, #4
- add up, up, #-4
- ldr w0, [rp, #-4]
- ldr w1, [rp, #0]
- ldr u0, [up, #4]
- b L(lo0)
-
-L(b2): add rp, rp, #12
- add up, up, #4
- ldr w1, [rp, #-12]
- ldr w2, [rp, #-8]
- ldr u0, [up, #-4]
-
- ALIGN(16)
-L(top): ldr w0, [rp, #-4]
- umaal w1, cy0, u0, v0
- ldr u1, [up, #0]
- umaal w2, cy1, u0, v1
- str w1, [rp, #-12]
- umaal w0, cy2, u0, v2
-L(lo1): ldr w1, [rp, #0]
- umaal w2, cy0, u1, v0
- ldr u0, [up, #4]
- umaal w0, cy1, u1, v1
- str w2, [rp, #-8]
- umaal w1, cy2, u1, v2
-L(lo0): ldr w2, [rp, #4]
- umaal w0, cy0, u0, v0
- ldr u1, [up, #8]
- umaal w1, cy1, u0, v1
- str w0, [rp, #-4]
- umaal w2, cy2, u0, v2
-L(lo5): ldr w0, [rp, #8]
- umaal w1, cy0, u1, v0
- ldr u0, [up, #12]
- umaal w2, cy1, u1, v1
- str w1, [rp, #0]
- umaal w0, cy2, u1, v2
-L(lo4): ldr w1, [rp, #12]
- umaal w2, cy0, u0, v0
- ldr u1, [up, #16]
- umaal w0, cy1, u0, v1
- str w2, [rp, #4]
- umaal w1, cy2, u0, v2
-L(lo3): ldr w2, [rp, #16]
- umaal w0, cy0, u1, v0
- ldr u0, [up, #20]
- umaal w1, cy1, u1, v1
- str w0, [rp, #8]
- umaal w2, cy2, u1, v2
-L(lo2): subs n, n, #6
- add up, up, #24
- add rp, rp, #24
- bge L(top)
-
-L(end): umaal w1, cy0, u0, v0
- ldr u1, [up, #0]
- umaal w2, cy1, u0, v1
- str w1, [rp, #-12]
- mov w0, #0
- umaal w0, cy2, u0, v2
- umaal w2, cy0, u1, v0
- umaal w0, cy1, u1, v1
- str w2, [rp, #-8]
- umaal cy1, cy2, u1, v2
- adds w0, w0, cy0
- str w0, [rp, #-4]
- adcs w1, cy1, #0
- str w1, [rp, #0]
- adc r0, cy2, #0
-
- pop { r4-r11, pc }
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/dive_1.asm b/gmp/mpn/arm/v6/dive_1.asm
deleted file mode 100644
index 92de81473f..0000000000
--- a/gmp/mpn/arm/v6/dive_1.asm
+++ /dev/null
@@ -1,149 +0,0 @@
-dnl ARM v6 mpn_divexact_1
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb
-C norm unorm modexact_1c_odd
-C StrongARM - -
-C XScale - -
-C Cortex-A7 ? ?
-C Cortex-A8 ? ?
-C Cortex-A9 9 10 9
-C Cortex-A15 7 7 7
-
-C Architecture requirements:
-C v5 -
-C v5t clz
-C v5te -
-C v6 umaal
-C v6t2 -
-C v7a -
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`d', `r3')
-
-define(`cy', `r7')
-define(`cnt', `r6')
-define(`tnc', `r10')
-
-ASM_START()
-PROLOGUE(mpn_divexact_1)
- push {r4,r5,r6,r7,r8,r9}
-
- tst d, #1
-
- rsb r4, d, #0
- and r4, r4, d
- clz r4, r4
- rsb cnt, r4, #31 C count_trailing_zeros
- mov d, d, lsr cnt
-
-C binvert limb
- LEA( r4, binvert_limb_table)
- and r12, d, #254
- ldrb r4, [r4, r12, lsr #1]
- mul r12, r4, r4
- mul r12, d, r12
- rsb r12, r12, r4, lsl #1
- mul r4, r12, r12
- mul r4, d, r4
- rsb r4, r4, r12, lsl #1 C r4 = inverse
-
- ldr r5, [up], #4 C up[0]
- mov cy, #0
- rsb r8, r4, #0 C r8 = -inverse
- beq L(unnorm)
-
-L(norm):
- subs n, n, #1
- mul r5, r5, r4
- beq L(end)
-
- ALIGN(16)
-L(top): ldr r9, [up], #4
- mov r12, #0
- str r5, [rp], #4
- umaal r12, cy, r5, d
- mul r5, r9, r4
- mla r5, cy, r8, r5
- subs n, n, #1
- bne L(top)
-
-L(end): str r5, [rp]
- pop {r4,r5,r6,r7,r8,r9}
- bx r14
-
-L(unnorm):
- push {r10,r11}
- rsb tnc, cnt, #32
- mov r11, r5, lsr cnt
- subs n, n, #1
- beq L(edx)
-
- ldr r12, [up], #4
- orr r9, r11, r12, lsl tnc
- mov r11, r12, lsr cnt
- mul r5, r9, r4
- subs n, n, #1
- beq L(edu)
-
- ALIGN(16)
-L(tpu): ldr r12, [up], #4
- orr r9, r11, r12, lsl tnc
- mov r11, r12, lsr cnt
- mov r12, #0
- str r5, [rp], #4
- umaal r12, cy, r5, d
- mul r5, r9, r4
- mla r5, cy, r8, r5
- subs n, n, #1
- bne L(tpu)
-
-L(edu): str r5, [rp], #4
- mov r12, #0
- umaal r12, cy, r5, d
- mul r5, r11, r4
- mla r5, cy, r8, r5
- str r5, [rp]
- pop {r10,r11}
- pop {r4,r5,r6,r7,r8,r9}
- bx r14
-
-L(edx): mul r5, r11, r4
- str r5, [rp]
- pop {r10,r11}
- pop {r4,r5,r6,r7,r8,r9}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/gmp-mparam.h b/gmp/mpn/arm/v6/gmp-mparam.h
deleted file mode 100644
index c9c6851769..0000000000
--- a/gmp/mpn/arm/v6/gmp-mparam.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 700MHz ARM11 (raspberry pi) */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 29
-#define USE_PREINV_DIVREM_1 1 /* preinv always */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 33
-
-#define MUL_TOOM22_THRESHOLD 36
-#define MUL_TOOM33_THRESHOLD 117
-#define MUL_TOOM44_THRESHOLD 462
-#define MUL_TOOM6H_THRESHOLD 0 /* always */
-#define MUL_TOOM8H_THRESHOLD 620
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 130
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 573
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 209
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 209
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 305
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 50
-#define SQR_TOOM3_THRESHOLD 181
-#define SQR_TOOM4_THRESHOLD 686
-#define SQR_TOOM6_THRESHOLD 0 /* always */
-#define SQR_TOOM8_THRESHOLD 915
-
-#define MULMID_TOOM42_THRESHOLD 72
-
-#define MULMOD_BNM1_THRESHOLD 25
-#define SQRMOD_BNM1_THRESHOLD 30
-
-#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 476, 5}, { 21, 6}, { 11, 5}, { 25, 6}, \
- { 13, 5}, { 27, 6}, { 25, 7}, { 13, 6}, \
- { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
- { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
- { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
- { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
- { 255,10}, { 143, 9}, { 287,10}, { 159,11}, \
- { 95,10}, { 191, 9}, { 383,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 63
-#define MUL_FFT_THRESHOLD 4736
-
-#define SQR_FFT_MODF_THRESHOLD 464 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 464, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
- { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
- { 35, 7}, { 71, 8}, { 43, 9}, { 23, 8}, \
- { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \
- { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
- { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 61
-#define SQR_FFT_THRESHOLD 3776
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 67
-#define MULLO_MUL_N_THRESHOLD 8907
-
-#define DC_DIV_QR_THRESHOLD 40
-#define DC_DIVAPPR_Q_THRESHOLD 156
-#define DC_BDIV_QR_THRESHOLD 71
-#define DC_BDIV_Q_THRESHOLD 208
-
-#define INV_MULMOD_BNM1_THRESHOLD 70
-#define INV_NEWTON_THRESHOLD 151
-#define INV_APPR_THRESHOLD 150
-
-#define BINV_NEWTON_THRESHOLD 375
-#define REDC_1_TO_REDC_2_THRESHOLD 5
-#define REDC_2_TO_REDC_N_THRESHOLD 134
-
-#define MU_DIV_QR_THRESHOLD 2130
-#define MU_DIVAPPR_Q_THRESHOLD 2130
-#define MUPI_DIV_QR_THRESHOLD 80
-#define MU_BDIV_QR_THRESHOLD 1787
-#define MU_BDIV_Q_THRESHOLD 2130
-
-#define POWM_SEC_TABLE 7,32,460,1705
-
-#define MATRIX22_STRASSEN_THRESHOLD 19
-#define HGCD_THRESHOLD 85
-#define HGCD_APPR_THRESHOLD 119
-#define HGCD_REDUCE_THRESHOLD 3389
-#define GCD_DC_THRESHOLD 333
-#define GCDEXT_DC_THRESHOLD 309
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 21
-#define GET_STR_PRECOMPUTE_THRESHOLD 41
-#define SET_STR_DC_THRESHOLD 527
-#define SET_STR_PRECOMPUTE_THRESHOLD 1323
-
-#define FAC_DSC_THRESHOLD 414
-#define FAC_ODD_THRESHOLD 154
diff --git a/gmp/mpn/arm/v6/mode1o.asm b/gmp/mpn/arm/v6/mode1o.asm
deleted file mode 100644
index a2f77a6bf5..0000000000
--- a/gmp/mpn/arm/v6/mode1o.asm
+++ /dev/null
@@ -1,95 +0,0 @@
-dnl ARM v6 mpn_modexact_1c_odd
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 9
-C Cortex-A15 7
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te smulbb
-C v6 umaal
-C v6t2 -
-C v7a -
-
-define(`up', `r0')
-define(`n', `r1')
-define(`d', `r2')
-define(`cy', `r3')
-
- .protected binvert_limb_table
-ASM_START()
-PROLOGUE(mpn_modexact_1c_odd)
- stmfd sp!, {r4, r5, r6, r7}
-
- LEA( r4, binvert_limb_table)
-
- ldr r6, [up], #4 C up[0]
-
- and r12, d, #254
- ldrb r4, [r4, r12, lsr #1]
- smulbb r12, r4, r4
- mul r12, d, r12
- rsb r12, r12, r4, asl #1
- mul r4, r12, r12
- mul r4, d, r4
- rsb r4, r4, r12, asl #1 C r4 = inverse
-
- subs n, n, #1
- sub r6, r6, cy
- mul r6, r6, r4
- beq L(end)
-
- rsb r5, r4, #0 C r5 = -inverse
-
-L(top): ldr r7, [up], #4
- mov r12, #0
- umaal r12, cy, r6, d
- mul r6, r7, r4
- mla r6, cy, r5, r6
- subs n, n, #1
- bne L(top)
-
-L(end): mov r12, #0
- umaal r12, cy, r6, d
- mov r0, cy
-
- ldmfd sp!, {r4, r5, r6, r7}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/mul_1.asm b/gmp/mpn/arm/v6/mul_1.asm
deleted file mode 100644
index 0fcc0e46d9..0000000000
--- a/gmp/mpn/arm/v6/mul_1.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-dnl ARM mpn_mul_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.25
-C Cortex-A15 4
-
-C TODO
-C * Micro-optimise feed-in code.
-C * Optimise for n=1,2 by delaying register saving.
-C * Try using ldm/stm.
-
-define(`rp',`r0')
-define(`up',`r1')
-define(`n', `r2')
-define(`v0',`r3')
-
-ASM_START()
-PROLOGUE(mpn_mul_1)
- stmfd sp!, { r4, r5, r6, r7 }
-
- ands r6, n, #3
- mov r12, #0
- beq L(fi0)
- cmp r6, #2
- bcc L(fi1)
- beq L(fi2)
-
-L(fi3): ldr r4, [up], #4
- mov r6, #0
- ldr r5, [up], #4
- b L(lo3)
-
-L(fi0): ldr r5, [up], #4
- add rp, rp, #4
- mov r7, #0
- ldr r4, [up], #4
- b L(lo0)
-
-L(fi1): ldr r4, [up], #4
- mov r6, #0
- add rp, rp, #8
- subs n, n, #1
- beq L(1)
- ldr r5, [up], #4
- b L(lo1)
-
-L(fi2): ldr r5, [up], #4
- add rp, rp, #12
- mov r7, #0
- ldr r4, [up], #4
- b L(lo2)
-
- ALIGN(16)
-L(top): mov r6, #0
- ldr r5, [up], #4
- str r7, [rp, #-12]
-L(lo1): umaal r6, r12, r4, v0
- mov r7, #0
- ldr r4, [up], #4
- str r6, [rp, #-8]
-L(lo0): umaal r7, r12, r5, v0
- mov r6, #0
- ldr r5, [up], #4
- str r7, [rp, #-4]
-L(lo3): umaal r6, r12, r4, v0
- mov r7, #0
- ldr r4, [up], #4
- str r6, [rp], #16
-L(lo2): umaal r7, r12, r5, v0
- subs n, n, #4
- bhi L(top)
-
- mov r6, #0
- str r7, [rp, #-12]
-L(1): umaal r6, r12, r4, v0
- str r6, [rp, #-8]
- mov r0, r12
- ldmfd sp!, { r4, r5, r6, r7 }
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/mul_2.asm b/gmp/mpn/arm/v6/mul_2.asm
deleted file mode 100644
index 1679542a3c..0000000000
--- a/gmp/mpn/arm/v6/mul_2.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-dnl ARM mpn_mul_2.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 2.25
-C Cortex-A15 2.5
-
-C TODO
-C * This is a trivial edit of the addmul_2 code. Check for simplifications,
-C and possible speedups to 2.0 c/l.
-
-define(`rp',`r0')
-define(`up',`r1')
-define(`n', `r2')
-define(`vp',`r3')
-
-define(`v0',`r6')
-define(`v1',`r7')
-define(`u0',`r3')
-define(`u1',`r9')
-
-define(`cya',`r8')
-define(`cyb',`r12')
-
-
-ASM_START()
-PROLOGUE(mpn_mul_2)
- push { r4, r5, r6, r7, r8, r9 }
-
- ldm vp, { v0, v1 }
- mov cya, #0
- mov cyb, #0
-
- tst n, #1
- beq L(evn)
-L(odd): mov r5, #0
- ldr u0, [up, #0]
- mov r4, #0
- tst n, #2
- beq L(fi1)
-L(fi3): sub up, up, #12
- sub rp, rp, #16
- b L(lo3)
-L(fi1): sub n, n, #1
- sub up, up, #4
- sub rp, rp, #8
- b L(lo1)
-L(evn): mov r4, #0
- ldr u1, [up, #0]
- mov r5, #0
- tst n, #2
- bne L(fi2)
-L(fi0): sub up, up, #8
- sub rp, rp, #12
- b L(lo0)
-L(fi2): subs n, n, #2
- sub rp, rp, #4
- bls L(end)
-
- ALIGN(16)
-L(top): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #4]
- mov r4, #0
- umaal r5, cyb, u1, v1
-L(lo1): ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #8]
- mov r5, #0
- umaal r4, cyb, u0, v1
-L(lo0): ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #12]
- mov r4, #0
- umaal r5, cyb, u1, v1
-L(lo3): ldr u1, [up, #16]!
- umaal r5, cya, u0, v0
- str r5, [rp, #16]!
- mov r5, #0
- umaal r4, cyb, u0, v1
- subs n, n, #4
- bhi L(top)
-
-L(end): umaal r4, cya, u1, v0
- ldr u0, [up, #4]
- umaal r5, cyb, u1, v1
- str r4, [rp, #4]
- umaal r5, cya, u0, v0
- umaal cya, cyb, u0, v1
- str r5, [rp, #8]
- str cya, [rp, #12]
- mov r0, cyb
-
- pop { r4, r5, r6, r7, r8, r9 }
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/popham.asm b/gmp/mpn/arm/v6/popham.asm
deleted file mode 100644
index 44c8f2361c..0000000000
--- a/gmp/mpn/arm/v6/popham.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-dnl ARM mpn_popcount and mpn_hamdist.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C popcount hamdist
-C cycles/limb cycles/limb
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 8.94 9.47
-C Cortex-A15 5.67 6.44
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te ldrd strd
-C v6 usada8
-C v6t2 -
-C v7a -
-
-ifdef(`OPERATION_popcount',`
- define(`func',`mpn_popcount')
- define(`ap', `r0')
- define(`n', `r1')
- define(`a0', `r2')
- define(`a1', `r3')
- define(`s', `r5')
- define(`b_01010101', `r6')
- define(`b_00110011', `r7')
- define(`b_00001111', `r8')
- define(`zero', `r9')
- define(`POPC', `$1')
- define(`HAMD', `dnl')
-')
-ifdef(`OPERATION_hamdist',`
- define(`func',`mpn_hamdist')
- define(`ap', `r0')
- define(`bp', `r1')
- define(`n', `r2')
- define(`a0', `r6')
- define(`a1', `r7')
- define(`b0', `r4')
- define(`b1', `r5')
- define(`s', `r11')
- define(`b_01010101', `r8')
- define(`b_00110011', `r9')
- define(`b_00001111', `r10')
- define(`zero', `r3')
- define(`POPC', `dnl')
- define(`HAMD', `$1')
-')
-
-
-ASM_START()
-PROLOGUE(func)
-POPC(` push { r4-r9 } ')
-HAMD(` push { r4-r11 } ')
-
- ldr b_01010101, =0x55555555
- mov r12, #0
- ldr b_00110011, =0x33333333
- mov zero, #0
- ldr b_00001111, =0x0f0f0f0f
-
- tst n, #1
- beq L(evn)
-
-L(odd): ldr a1, [ap], #4 C 1 x 32 1-bit accumulators, 0-1
-HAMD(` ldr b1, [bp], #4 ') C 1 x 32 1-bit accumulators, 0-1
-HAMD(` eor a1, a1, b1 ')
- and r4, b_01010101, a1, lsr #1
- sub a1, a1, r4
- and r4, a1, b_00110011
- bic r5, a1, b_00110011
- add r5, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
- subs n, n, #1
- b L(mid)
-
-L(evn): mov s, #0
-
-L(top): ldrd a0, a1, [ap], #8 C 2 x 32 1-bit accumulators, 0-1
-HAMD(` ldrd b0, b1, [bp], #8')
-HAMD(` eor a0, a0, b0 ')
-HAMD(` eor a1, a1, b1 ')
- subs n, n, #2
- usada8 r12, s, zero, r12
- and r4, b_01010101, a0, lsr #1
- sub a0, a0, r4
- and r4, b_01010101, a1, lsr #1
- sub a1, a1, r4
- and r4, a0, b_00110011
- bic r5, a0, b_00110011
- add a0, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
- and r4, a1, b_00110011
- bic r5, a1, b_00110011
- add a1, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
- add r5, a0, a1 C 8 4-bit accumulators, 0-8
-L(mid): and r4, r5, b_00001111
- bic r5, r5, b_00001111
- add s, r4, r5, lsr #4 C 4 8-bit accumulators
- bne L(top)
-
- usada8 r0, s, zero, r12
-POPC(` pop { r4-r9 } ')
-HAMD(` pop { r4-r11 } ')
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/sqr_basecase.asm b/gmp/mpn/arm/v6/sqr_basecase.asm
deleted file mode 100644
index d52970aaa7..0000000000
--- a/gmp/mpn/arm/v6/sqr_basecase.asm
+++ /dev/null
@@ -1,518 +0,0 @@
-dnl ARM v6 mpn_sqr_basecase.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C Code structure:
-C
-C
-C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
-C | | | |
-C | | | |
-C | | | |
-C \|/ \|/ \|/ \|/
-C ____________ ____________
-C / \ / \
-C \|/ \ \|/ \
-C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
-C \ /|\ \ /|\
-C \____________/ \____________/
-C \ /
-C \ /
-C \ /
-C tail(0m2) tail(1m2)
-C \ /
-C \ /
-C sqr_diag_addlsh1
-
-C TODO
-C * Further tweak counter and updates in outer loops. (This could save
-C perhaps 5n cycles).
-C * Try to use fewer registers. Perhaps coalesce r9 branch target and n_saved.
-C (This could save 2-3 cycles for n > 4.)
-C * Optimise sqr_diag_addlsh1 loop. (This could save O(n) cycles.)
-C * Implement larger final corners (xit/tix). Also stop loops earlier
-C suppressing writes of upper-most rp[] values. (This could save 10-20
-C cycles for n > 4.)
-C * Is the branch table really faster than discrete branches?
-
-define(`rp', r0)
-define(`up', r1)
-define(`n', r2)
-
-define(`v0', r3)
-define(`v1', r6)
-define(`i', r8)
-define(`n_saved', r14)
-define(`cya', r11)
-define(`cyb', r12)
-define(`u0', r7)
-define(`u1', r9)
-
-ASM_START()
-PROLOGUE(mpn_sqr_basecase)
-C Entry dispatch.  A table index is built in r12 and added, scaled by 4,
-C to pc.  The index is n mod 4, plus 4 when n > 4, so sizes 1 to 4 reach
-C the straight line cases and larger sizes reach the feed-in code for
-C their residue class mod 4.
- and r12, n, #3 C r12 = n mod 4
- cmp n, #4
- addgt r12, r12, #4 C second half of the table when n > 4
-C NOTE(review): this relies on pc reading as the address of this
-C instruction plus 8 (ARM encoding), which is why one nop pads the
-C table so that index 0 selects the first branch.
- add pc, pc, r12, lsl #2
- nop
- b L(4) C index 0: n mod 4 = 0 and n <= 4
- b L(1) C index 1
- b L(2) C index 2
- b L(3) C index 3
- b L(0m4) C index 4: n > 4, n mod 4 = 0
- b L(1m4) C index 5: n > 4, n mod 4 = 1
- b L(2m4) C index 6: n > 4, n mod 4 = 2
- b L(3m4) C index 7: n > 4, n mod 4 = 3
-
-
-C Feed-in code for n > 4, one variant per residue of n mod 4.  Each
-C variant saves registers, copies n to n_saved, sets i = n - 4 and
-C n = n - 2, loads the address of the tail to run after the first pass
-C into r10, loads the first source limbs, forms the first product
-C v1 * v0 with umull, stores its low word, and enters the 4-way
-C unrolled loop at the matching point.  The pre-adjustments of up and
-C rp differ per variant to match the offsets used at the entry point.
-L(1m4): push {r4-r10,r11,r14}
- mov n_saved, n
- sub i, n, #4
- sub n, n, #2
- add r10, pc, #L(am2_2m4)-.-8 C r10 = address of L(am2_2m4)
- ldm up, {v0,v1,u0}
- sub up, up, #4
- mov cyb, #0
- mov r5, #0
- umull r4, cya, v1, v0
- str r4, [rp], #-12
- mov r4, #0
- b L(ko0)
-
-L(3m4): push {r4-r10,r11,r14}
- mov n_saved, n
- sub i, n, #4
- sub n, n, #2
- add r10, pc, #L(am2_0m4)-.-8 C r10 = address of L(am2_0m4)
- ldm up, {v0,v1,u0}
- add up, up, #4
- mov cyb, #0
- mov r5, #0
- umull r4, cya, v1, v0
- str r4, [rp], #-4
- mov r4, #0
- b L(ko2)
-
-L(2m4): push {r4-r10,r11,r14}
- mov n_saved, n
- sub i, n, #4
- sub n, n, #2
- add r10, pc, #L(am2_3m4)-.-8 C r10 = address of L(am2_3m4)
- ldm up, {v0,v1,u1}
- mov cyb, #0
- mov r4, #0
- umull r5, cya, v1, v0
- str r5, [rp], #-8
- mov r5, #0
- b L(ko1)
-
-L(0m4): push {r4-r10,r11,r14}
- mov n_saved, n
- sub i, n, #4
- sub n, n, #2
- add r10, pc, #L(am2_1m4)-.-8 C r10 = address of L(am2_1m4)
- ldm up, {v0,v1,u1}
- mov cyb, #0
- mov r4, #0
- add up, up, #8
- umull r5, cya, v1, v0
- str r5, [rp, #0]
- mov r5, #0
-
-C First pass: multiply source limbs by v0 and v1 together.  Each umaal
-C computes hi:lo = a * b + lo + hi, so r4 and r5 alternate as the limb
-C being built, cya carries the v0 chain and cyb carries the v1 chain.
-C The limb register is zeroed after each store because this pass
-C writes rp without reading it.
-L(top): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #4]
- mov r4, #0
- umaal r5, cyb, u1, v1
-L(ko2): ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #8]
- mov r5, #0
- umaal r4, cyb, u0, v1
-L(ko1): ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #12]
- mov r4, #0
- umaal r5, cyb, u1, v1
-L(ko0): ldr u1, [up, #16]!
- umaal r5, cya, u0, v0
- str r5, [rp, #16]!
- mov r5, #0
- umaal r4, cyb, u0, v1
- subs i, i, #4
- bhi L(top)
-C Continue at the tail whose address the feed-in placed in r10.
- bx r10
-
-C Outer loop holding two accumulate passes per trip, L(ua2) and L(ua0).
-C Each pass works like the first pass above, except that the limb
-C register is reloaded from rp after each store instead of being
-C zeroed, so products are added to what rp already holds.  Every
-C pass start reduces n by 2.
-L(evnloop):
- subs i, n, #4
- sub n, n, #2
- blt L(tix) C n < 4 (signed): finish in L(tix)
- ldm up, {v0,v1,u0}
- add up, up, #4
- mov cya, #0
- mov cyb, #0
- ldm rp, {r4,r5}
- sub rp, rp, #4
- umaal r4, cya, v1, v0
- str r4, [rp, #4]
- ldr r4, [rp, #12]
- b L(lo2)
-L(ua2): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #4]
- ldr r4, [rp, #12]
- umaal r5, cyb, u1, v1
-L(lo2): ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #8]
- ldr r5, [rp, #16]
- umaal r4, cyb, u0, v1
- ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #12]
- ldr r4, [rp, #20]
- umaal r5, cyb, u1, v1
- ldr u1, [up, #16]!
- umaal r5, cya, u0, v0
- str r5, [rp, #16]!
- ldr r5, [rp, #8]
- umaal r4, cyb, u0, v1
- subs i, i, #4
- bhi L(ua2)
-C Tail of a pass (also entered via bx r10 from the first pass): finish
-C the last products, store the two carry limbs cya and cyb, rewind up
-C and rp by n limbs, advance up by two limbs, and start the next pass
-C directly, entering its loop at L(lo0).
-L(am2_0m4):
- umaal r4, cya, u1, v0
- ldr u0, [up, #4]
- umaal r5, cyb, u1, v1
- str r4, [rp, #4]
- umaal r5, cya, u0, v0
- umaal cya, cyb, u0, v1
- str r5, [rp, #8]
- str cya, [rp, #12]
- str cyb, [rp, #16]
- sub up, up, n, lsl #2
- sub rp, rp, n, lsl #2
- add up, up, #8
- sub i, n, #4
- sub n, n, #2
- ldm up, {v0,v1,u0}
- sub up, up, #4
- mov cya, #0
- mov cyb, #0
- ldr r4, [rp, #24]
- ldr r5, [rp, #28]
- add rp, rp, #12
- umaal r4, cya, v1, v0
- str r4, [rp, #12]
- ldr r4, [rp, #20]
- b L(lo0)
-L(ua0): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #4]
- ldr r4, [rp, #12]
- umaal r5, cyb, u1, v1
- ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #8]
- ldr r5, [rp, #16]
- umaal r4, cyb, u0, v1
- ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #12]
- ldr r4, [rp, #20]
- umaal r5, cyb, u1, v1
-L(lo0): ldr u1, [up, #16]!
- umaal r5, cya, u0, v0
- str r5, [rp, #16]!
- ldr r5, [rp, #8]
- umaal r4, cyb, u0, v1
- subs i, i, #4
- bhi L(ua0)
-C Tail of the second pass (also entered via bx r10): store the carry
-C limbs, rewind up and rp by n limbs, step up by two limbs and rp by
-C six limbs, and go round the outer loop again.
-L(am2_2m4):
- umaal r4, cya, u1, v0
- ldr u0, [up, #4]
- umaal r5, cyb, u1, v1
- str r4, [rp, #4]
- umaal r5, cya, u0, v0
- umaal cya, cyb, u0, v1
- str r5, [rp, #8]
- str cya, [rp, #12]
- str cyb, [rp, #16]
- sub up, up, n, lsl #2
- sub rp, rp, n, lsl #2
- add up, up, #8
- add rp, rp, #24
- b L(evnloop)
-
-
-C Outer loop holding two accumulate passes per trip, L(ua1) and L(ua3).
-C Each pass multiplies source limbs by v0 and v1 together with umaal
-C (hi:lo = a * b + lo + hi) and reloads the limb register from rp after
-C each store, so products are added to what rp already holds.  cya
-C carries the v0 chain and cyb the v1 chain.  Every pass start reduces
-C n by 2.
-L(oddloop):
- subs i, n, #4
- sub n, n, #2
- blt L(xit) C n < 4 (signed): finish in L(xit)
- ldm up, {v0,v1,u1}
- mov cya, #0
- mov cyb, #0
- sub rp, rp, #8
- ldr r5, [rp, #8]
- ldr r4, [rp, #12]
- umaal r5, cya, v1, v0
- str r5, [rp, #8]
- ldr r5, [rp, #16]
- b L(lo1)
-L(ua1): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #4]
- ldr r4, [rp, #12]
- umaal r5, cyb, u1, v1
- ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #8]
- ldr r5, [rp, #16]
- umaal r4, cyb, u0, v1
-L(lo1): ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #12]
- ldr r4, [rp, #20]
- umaal r5, cyb, u1, v1
- ldr u1, [up, #16]!
- umaal r5, cya, u0, v0
- str r5, [rp, #16]!
- ldr r5, [rp, #8]
- umaal r4, cyb, u0, v1
- subs i, i, #4
- bhi L(ua1)
-C Tail of a pass (also entered via bx r10 from the first pass): finish
-C the last products, store the two carry limbs, rewind up and rp by n
-C limbs, step up by two limbs and rp by six limbs, then set up the
-C next pass inline.
-L(am2_3m4):
- umaal r4, cya, u1, v0
- ldr u0, [up, #4]
- umaal r5, cyb, u1, v1
- str r4, [rp, #4]
- umaal r5, cya, u0, v0
- umaal cya, cyb, u0, v1
- str r5, [rp, #8]
- str cya, [rp, #12]
- str cyb, [rp, #16]
- sub up, up, n, lsl #2
- sub rp, rp, n, lsl #2
- add up, up, #8
- add rp, rp, #24
-C The flags set by this subs are tested by the bls below; nothing in
-C between sets flags.
- subs i, n, #4
- sub n, n, #2
- ldm up, {v0,v1,u1}
- mov cya, #0
- mov cyb, #0
- ldr r5, [rp, #0]
- ldr r4, [rp, #4]
- add up, up, #8
- umaal r5, cya, v1, v0
- str r5, [rp, #0]
- ldr r5, [rp, #8]
- bls L(e3) C skip the loop when n <= 4 (unsigned)
-L(ua3): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #4]
- ldr r4, [rp, #12]
- umaal r5, cyb, u1, v1
- ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #8]
- ldr r5, [rp, #16]
- umaal r4, cyb, u0, v1
- ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #12]
- ldr r4, [rp, #20]
- umaal r5, cyb, u1, v1
- ldr u1, [up, #16]!
- umaal r5, cya, u0, v0
- str r5, [rp, #16]!
- ldr r5, [rp, #8]
- umaal r4, cyb, u0, v1
- subs i, i, #4
- bhi L(ua3)
-C Tail of the second pass (also entered via bx r10): same wind-down as
-C above, then back to the top of the outer loop.
-L(e3):
-L(am2_1m4):
- umaal r4, cya, u1, v0
- ldr u0, [up, #4]
- umaal r5, cyb, u1, v1
- str r4, [rp, #4]
- umaal r5, cya, u0, v0
- umaal cya, cyb, u0, v1
- str r5, [rp, #8]
- str cya, [rp, #12]
- str cyb, [rp, #16]
- sub up, up, n, lsl #2
- sub rp, rp, n, lsl #2
- add up, up, #8
- add rp, rp, #24
- b L(oddloop)
-
-C Final corner reached from L(oddloop): one remaining product u0 * v0
-C is added to the limb at rp, leaving the two result words in cya and
-C cyb for L(sqr_diag_addlsh1) to store.  up advances by two limbs and
-C rp by three limbs.
-L(xit): ldm up!, {v0,u0}
- ldr cya, [rp], #12
- mov cyb, #0
- umaal cya, cyb, u0, v0
- b L(sqr_diag_addlsh1)
-
-C Final corner reached from L(evnloop): three remaining products
-C v1 * v0, u0 * v0 and u0 * v1.  The first two are added into the two
-C limbs at rp and stored; the last leaves its two result words in cya
-C and cyb.  up advances by three limbs and rp by five limbs, then
-C execution falls through to L(sqr_diag_addlsh1).
-L(tix): ldm up!, {v0,v1,u0}
- ldm rp, {r4,r5}
- mov cya, #0
- mov cyb, #0
- umaal r4, cya, v1, v0
- umaal r5, cya, u0, v0
- stm rp, {r4,r5}
- umaal cya, cyb, u0, v1
- add rp, rp, #20
-C b L(sqr_diag_addlsh1)
-
-
-define(`w0', r6)
-define(`w1', r7)
-define(`w2', r8)
-define(`rbx', r9)
-
-C Final phase: store the last two limbs left in cya and cyb by the
-C corner code, rewind up by n_saved limbs and rp by 2 * n_saved limbs,
-C then walk the result two limbs at a time.  Each step doubles the two
-C stored limbs (adcs x, x, x) and adds the square of one source limb
-C (umull of r3 by itself).  The carry out of the doubling is kept in w2
-C and added, via rbx, to the high word of the previous square.
-L(sqr_diag_addlsh1):
- str cya, [rp, #-12]
- str cyb, [rp, #-8]
- sub n, n_saved, #1 C loop count
- sub up, up, n_saved, lsl #2
- sub rp, rp, n_saved, lsl #3
- ldr r3, [up], #4
- umull w1, r5, r3, r3 C square of the first source limb
- mov w2, #0
- mov r10, #0 C constant zero for the adc below
-C cmn r0, #0 C clear cy (already clear by luck)
- b L(lm)
-
-C Add the previous square (low word with doubling carry in rbx, high
-C word in r4) to the doubled pair and store the lower limb.
-L(tsd): adds w0, w0, rbx
- adcs w1, w1, r4
- str w0, [rp, #0]
-L(lm): ldr w0, [rp, #4]
- str w1, [rp, #4]
- ldr w1, [rp, #8]!
- add rbx, r5, w2 C square high word + doubling carry
- adcs w0, w0, w0 C double, continuing the carry chain
- ldr r3, [up], #4
- adcs w1, w1, w1
- adc w2, r10, r10 C w2 = carry out of the doubling
- umull r4, r5, r3, r3 C square of the next source limb
- subs n, n, #1
- bne L(tsd)
-
-C Wind down: add the last square and store the top three limbs.
- adds w0, w0, rbx
- adcs w1, w1, r4
- adc w2, r5, w2
- stm rp, {w0,w1,w2}
-
-C Restore the registers saved by the feed-in code and return via pc.
- pop {r4-r10,r11,pc}
-
-
-C Straight line code for n <= 4
-
-C n = 1: the result is the 64-bit square of the single source limb.
-L(1): ldr r3, [up, #0]
- umull r1, r2, r3, r3
- stm rp, {r1,r2}
- bx r14
-
-C n = 2: form both squares and the cross product, double the cross
-C product, and add it in one limb up.  The carry out of each step is
-C added to the top limb r4.
-L(2): push {r4-r5}
- ldm up, {r5,r12}
- umull r1, r2, r5, r5 C r2:r1 = first limb squared
- umull r3, r4, r12, r12 C r4:r3 = second limb squared
- umull r5, r12, r5, r12 C r12:r5 = cross product
- adds r5, r5, r5 C double the cross product
- adcs r12, r12, r12
- adc r4, r4, #0
- adds r2, r2, r5 C add it in at limb 1
- adcs r3, r3, r12
- adc r4, r4, #0
- stm rp, {r1,r2,r3,r4}
- pop {r4-r5}
- bx r14
-
-C n = 3: three squares in r1-r6, the sum of the three cross products
-C in r10, r11, r12, r7.  The cross product sum is doubled and added in
-C one limb up, with carries collected in the top limb r6.
-L(3): push {r4-r11}
- ldm up, {r7,r8,r9}
- umull r1, r2, r7, r7
- umull r3, r4, r8, r8
- umull r5, r6, r9, r9
- umull r10, r11, r7, r8
- mov r12, #0
- umlal r11, r12, r7, r9
- mov r7, #0
- umlal r12, r7, r8, r9
- adds r10, r10, r10 C double the cross product sum
- adcs r11, r11, r11
- adcs r12, r12, r12
- adcs r7, r7, r7
- adc r6, r6, #0
- adds r2, r2, r10 C add it in at limb 1
- adcs r3, r3, r11
- adcs r4, r4, r12
- adcs r5, r5, r7
- adc r6, r6, #0
- stm rp, {r1,r2,r3,r4,r5,r6}
- pop {r4-r11}
- bx r14
-
-C n = 4: the four squares are formed first and seven of their eight
-C words are parked in rp; the top word stays in r8.  The six cross
-C products are then summed into r1-r6, doubled, and added to the
-C parked words from limb 1 upwards.
-L(4): push {r4-r11, r14}
- ldm up, {r9,r10,r11,r12}
- umull r1, r2, r9, r9
- umull r3, r4, r10, r10
- umull r5, r6, r11, r11
- umull r7, r8, r12, r12
- stm rp, {r1,r2,r3,r4,r5,r6,r7}
- umull r1, r2, r9, r10
- mov r3, #0
- umlal r2, r3, r9, r11
- mov r4, #0
- umlal r3, r4, r9, r12
- mov r5, #0
- umlal r3, r5, r10, r11
- umaal r4, r5, r10, r12
- mov r6, #0
- umlal r5, r6, r11, r12
- adds r1, r1, r1 C double the cross product sum
- adcs r2, r2, r2
- adcs r3, r3, r3
- adcs r4, r4, r4
- adcs r5, r5, r5
- adcs r6, r6, r6
- adc r7, r8, #0 C top square word plus doubling carry
- add rp, rp, #4
- ldm rp, {r8,r9,r10,r11,r12,r14} C reload parked words 1 to 6
- adds r1, r1, r8
- adcs r2, r2, r9
- adcs r3, r3, r10
- adcs r4, r4, r11
- adcs r5, r5, r12
- adcs r6, r6, r14
- adc r7, r7, #0
- stm rp, {r1,r2,r3,r4,r5,r6,r7}
- pop {r4-r11, pc}
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6/submul_1.asm b/gmp/mpn/arm/v6/submul_1.asm
deleted file mode 100644
index 8a21733a0a..0000000000
--- a/gmp/mpn/arm/v6/submul_1.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-dnl ARM mpn_submul_1.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM: -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.75
-C Cortex-A15 4.0
-
-C This loop complements U on the fly,
-C U' = B^n - 1 - U
-C and then uses that
-C R - U*v = R + U'*v + v - B^n v
-
-C TODO
-C * Micro-optimise feed-in code.
-C * Optimise for n=1,2 by delaying register saving.
-C * Try using ldm/stm.
-
-define(`rp',`r0')
-define(`up',`r1')
-define(`n', `r2')
-define(`v0',`r3')
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
-C Register use: r4 and r5 hold complemented source limbs, r6 and r7
-C hold rp limbs being updated, r12 is the running carry limb.  Each
-C umaal computes r12:lo = limb * v0 + lo + r12.
- stmfd sp!, { r4, r5, r6, r7 }
-
-C Dispatch on n mod 4 into the 4-way unrolled loop.  r12 starts at v0,
-C which supplies the "+ v" term of the identity in the file header.
- ands r6, n, #3
- mov r12, v0
- beq L(fi0)
- cmp r6, #2
- bcc L(fi1)
- beq L(fi2)
-
-C Feed-in for n mod 4 = 3.
-L(fi3): ldr r4, [up], #12
- mvn r4, r4
- ldr r6, [rp, #0]
- ldr r5, [up, #-8]
- b L(lo3)
-
-C Feed-in for n mod 4 = 0.
-L(fi0): ldr r5, [up], #16
- mvn r5, r5
- ldr r7, [rp], #4
- ldr r4, [up, #-12]
- b L(lo0)
-
-C Feed-in for n mod 4 = 1; n = 1 skips the loop and goes to L(1).
-L(fi1): ldr r4, [up], #4
- mvn r4, r4
- ldr r6, [rp], #8
- subs n, n, #1
- beq L(1)
- ldr r5, [up]
- b L(lo1)
-
-C Feed-in for n mod 4 = 2.
-L(fi2): ldr r5, [up], #8
- mvn r5, r5
- ldr r7, [rp], #12
- ldr r4, [up, #-4]
- b L(lo2)
-
- ALIGN(16)
-C Main loop, four limbs per iteration.  Source limbs are complemented
-C with mvn as they are loaded.
-L(top): ldr r6, [rp, #-8]
- ldr r5, [up]
- str r7, [rp, #-12]
-L(lo1): umaal r6, r12, r4, v0
- add up, up, #16
- mvn r5, r5
- ldr r7, [rp, #-4]
- ldr r4, [up, #-12]
- str r6, [rp, #-8]
-L(lo0): umaal r7, r12, r5, v0
- mvn r4, r4
- ldr r6, [rp, #0]
- ldr r5, [up, #-8]
- str r7, [rp, #-4]
-L(lo3): umaal r6, r12, r4, v0
- mvn r5, r5
- ldr r7, [rp, #4]
- ldr r4, [up, #-4]
- str r6, [rp], #16
-L(lo2): umaal r7, r12, r5, v0
- mvn r4, r4
- subs n, n, #4
- bhi L(top)
-
-C Wind down: one limb is still pending in r4.
- ldr r6, [rp, #-8]
- str r7, [rp, #-12]
-L(1): umaal r6, r12, r4, v0
- str r6, [rp, #-8]
-C Return value: v0 minus the final carry limb.
- sub r0, v0, r12
- ldmfd sp!, { r4, r5, r6, r7 }
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6t2/divrem_1.asm b/gmp/mpn/arm/v6t2/divrem_1.asm
deleted file mode 100644
index be24615acb..0000000000
--- a/gmp/mpn/arm/v6t2/divrem_1.asm
+++ /dev/null
@@ -1,212 +0,0 @@
-dnl ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C norm unorm frac
-C StrongARM - - -
-C XScale - - -
-C Cortex-A7 ? ? ?
-C Cortex-A8 ? ? ?
-C Cortex-A9 13 14 13
-C Cortex-A15 11.4 11.8 11.1
-
-C TODO
-C * Optimise inner-loops better, they could likely run a cycle or two faster.
-C * Decrease register usage, streamline non-loop code.
-
-define(`qp_arg', `r0')
-define(`fn', `r1')
-define(`up_arg', `r2')
-define(`n_arg', `r3')
-define(`d_arg', `0')
-define(`dinv_arg',`4')
-define(`cnt_arg', `8')
-
-define(`n', `r9')
-define(`qp', `r5')
-define(`up', `r6')
-define(`cnt', `r7')
-define(`tnc', `r10')
-define(`dinv', `r0')
-define(`d', `r4')
-
-ASM_START()
-C Two entry points share the code below.  mpn_preinv_divrem_1 takes the
-C divisor inverse and shift count from the stack; mpn_divrem_1 computes
-C them itself with clz and mpn_invert_limb.  The quotient is written
-C from the most significant limb downwards and the remainder is
-C returned in r0.
-C NOTE(review): nine registers are pushed, so sp is 36 bytes below its
-C entry value at the bl instructions; confirm that the callee does not
-C need 8-byte stack alignment.
-PROLOGUE(mpn_preinv_divrem_1)
- stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
- ldr d, [sp, #9*4+d_arg]
- ldr cnt, [sp, #9*4+cnt_arg]
- str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn
- sub n, r3, #1 C n = index of the top source limb
- add r3, r1, n
- cmp d, #0
- add qp, qp_arg, r3, lsl #2 C put qp at Q[] end
- add up, up_arg, n, lsl #2 C put up at U[] end
- ldr dinv, [sp, #9*4+dinv_arg]
- blt L(nent) C top bit of d set: normalised path
- b L(uent)
-EPILOGUE()
-
-PROLOGUE(mpn_divrem_1)
- stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
- sub n, r3, #1
- ldr d, [sp, #9*4+d_arg] C d
- str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn
- add r3, r1, n
- cmp d, #0
- add qp, qp_arg, r3, lsl #2 C put qp at Q[] end
- add up, up_arg, n, lsl #2 C put up at U[] end
- blt L(normalised)
-
-C Unnormalised divisor: shift d left by its leading zero count and get
-C the inverse of the shifted value.  The result arrives in r0, which is
-C the register named dinv.
-L(unnorm):
- clz cnt, d
- mov r0, d, lsl cnt C pass d << cnt
- bl mpn_invert_limb
-L(uent):
- mov d, d, lsl cnt C d <<= cnt
- cmp n, #0
- mov r1, #0 C r
- blt L(frac) C no integer limbs
-
- ldr r11, [up, #0]
-
-C Split the top limb: the bits shifted out become the initial
-C remainder r1, the rest stays in r11.  The beq below still tests the
-C cmp n, #0 above, since nothing in between sets flags.
- rsb tnc, cnt, #32
- mov r1, r11, lsr tnc
- mov r11, r11, lsl cnt
- beq L(uend)
-
- ldr r3, [up, #-4]!
- orr r2, r11, r3, lsr tnc
- b L(mid)
-
-C Main loop for the unnormalised case.  Per limb: umlal forms the
-C candidate quotient in r8 from r1 * dinv, mls forms the candidate
-C remainder, and the conditional add/sub pair adjusts both.  The rare
-C case r1 >= d is fixed up out of line at L(ufx).
-L(utop):
- mls r1, d, r8, r11
- mov r11, r3, lsl cnt
- ldr r3, [up, #-4]!
- cmp r1, r2
- addhi r1, r1, d
- subhi r8, r8, #1
- orr r2, r11, r3, lsr tnc
- cmp r1, d
- bcs L(ufx)
-L(uok): str r8, [qp], #-4
-L(mid): add r8, r1, #1
- mov r11, r2
- umlal r2, r8, r1, dinv
- subs n, n, #1
- bne L(utop)
-
-C Finish the quotient limb that is still in flight, fixup inline.
- mls r1, d, r8, r11
- mov r11, r3, lsl cnt
- cmp r1, r2
- addhi r1, r1, d
- subhi r8, r8, #1
- cmp r1, d
- rsbcs r1, d, r1
- addcs r8, r8, #1
- str r8, [qp], #-4
-
-C Last integer quotient limb, from the low bits left in r11.
-L(uend):add r8, r1, #1
- mov r2, r11
- umlal r2, r8, r1, dinv
- mls r1, d, r8, r11
- cmp r1, r2
- addhi r1, r1, d
- subhi r8, r8, #1
- cmp r1, d
- rsbcs r1, d, r1
- addcs r8, r8, #1
- str r8, [qp], #-4
-C Fraction limbs: fn was stored in the stack slot that held d.  Each
-C step divides with a zero low limb.  r6 is used as scratch here, so
-C the up pointer is no longer valid from this point.
-L(frac):
- ldr r2, [sp, #9*4+d_arg] C fn
- cmp r2, #0
- beq L(fend)
-
-L(ftop):mov r6, #0
- add r3, r1, #1
- umlal r6, r3, r1, dinv
- mov r8, #0
- mls r1, d, r3, r8
- cmp r1, r6
- addhi r1, r1, d
- subhi r3, r3, #1
- subs r2, r2, #1
- str r3, [qp], #-4
- bne L(ftop)
-
-C Shift the remainder back down by cnt and return it.
-L(fend):mov r11, r1, lsr cnt
-L(rtn): mov r0, r11
- ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-C Normalised divisor: no shifting needed.
-L(normalised):
- mov r0, d
- bl mpn_invert_limb
-L(nent):
- cmp n, #0
- mov r11, #0 C r
- blt L(nend) C no integer limbs
-
-C The top quotient limb is 1 if the top source limb is >= d, else 0.
- ldr r11, [up, #0]
- cmp r11, d
- movlo r2, #0 C hi q limb
- movhs r2, #1 C hi q limb
- subhs r11, r11, d
-
- str r2, [qp], #-4
- cmp n, #0
- beq L(nend)
-
-C Main loop for the normalised case, with the remainder kept in r11.
-L(ntop):ldr r1, [up, #-4]!
- add r12, r11, #1
- umlal r1, r12, r11, dinv
- ldr r3, [up, #0]
- mls r11, d, r12, r3
- cmp r11, r1
- addhi r11, r11, d
- subhi r12, r12, #1
- cmp d, r11
- bls L(nfx)
-L(nok): str r12, [qp], #-4
- subs n, n, #1
- bne L(ntop)
-
-C Join the shared fraction code with a zero shift count.
-L(nend):mov r1, r11 C r
- mov cnt, #0 C shift cnt
- b L(frac)
-
-C Out of line fixups for the rare case remainder >= d.
-L(nfx): add r12, r12, #1
- rsb r11, d, r11
- b L(nok)
-L(ufx): rsb r1, d, r1
- add r8, r8, #1
- b L(uok)
-EPILOGUE()
diff --git a/gmp/mpn/arm/v6t2/gcd_1.asm b/gmp/mpn/arm/v6t2/gcd_1.asm
deleted file mode 100644
index 2063647963..0000000000
--- a/gmp/mpn/arm/v6t2/gcd_1.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-dnl ARM v6t2 mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 5.3
-C Cortex-A15 3.5
-C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop better.
-C * Push saving/restoring of callee-saved regs into call code
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 7)
-
-C INPUT PARAMETERS
-define(`up', `r0')
-define(`n', `r1')
-define(`v0', `r2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
- `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-C mpn_gcd_1 (up, n, v0): the gcd is returned in r0.
-C Register use: r4 = count of common low zero bits, restored into the
-C result at the end; r7 = v0 with its low zero bits removed.  Both are
-C callee-saved and so survive the calls below.
-PROLOGUE(mpn_gcd_1)
-C r3 is pushed only as padding: four words keep sp 8-byte aligned at
-C the calls below, as the procedure call standard requires at public
-C interfaces.  Its saved value is never used.
- push {r3, r4, r7, lr}
- ldr r3, [up] C U low limb
-
- orr r3, r3, v0
- rbit r4, r3
- clz r4, r4 C min(ctz(u0),ctz(v0))
-
-C Remove the low zero bits of v0.
- rbit r12, v0
- clz r12, r12
- mov v0, v0, lsr r12
-
- mov r7, v0
-
- cmp n, #1
- bne L(nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- ldr r3, [up]
- cmp v0, r3, lsr #BMOD_THRES_LOG2
- bhi L(red1)
-
-L(bmod):mov r3, #0 C carry argument
- bl mpn_modexact_1c_odd
- b L(red0)
-
-C More than one limb: pick the reduction function by size.
-L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
- blo L(bmod)
-
- bl mpn_mod_1
-
-C r3 = reduced U.  If it is zero the gcd is r7; otherwise enter the
-C loop with r12 = count of low zero bits of r3.
-L(red0):mov r3, r0
-L(red1):cmp r3, #0
- rbit r12, r3
- clz r12, r12
- bne L(mid)
- b L(end)
-
- ALIGN(8)
-C Loop on the pair r3, r7.  Both differences are formed each round;
-C the carry from the subs picks which one to keep.  The loop ends when
-C the two values are equal.
-L(top): movcs r3, r1 C if x-y < 0
- movcs r7, r0 C use x,y-x
-L(mid): mov r3, r3, lsr r12 C
- mov r0, r3 C
- subs r1, r7, r3 C
- rsb r3, r7, r3 C
- rbit r12, r1
- clz r12, r12 C
- bne L(top) C
-
-C Put back the common low zero bits and return.
-L(end): mov r0, r7, lsl r4
- pop {r3, r4, r7, pc}
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/addmul_1.asm b/gmp/mpn/arm/v7a/cora15/addmul_1.asm
deleted file mode 100644
index c2277b32b2..0000000000
--- a/gmp/mpn/arm/v7a/cora15/addmul_1.asm
+++ /dev/null
@@ -1,145 +0,0 @@
-dnl ARM mpn_addmul_1 optimised for A15.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C StrongARM: -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 6 3.25
-C Cortex-A15 2 this
-
-C This code uses umlal for adding in the rp[] data, keeping the recurrency path
-C separate from any multiply instructions. It performs well on A15, at umlal's
-C bandwidth.
-C
-C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
-C for all loads and stores. Alternatively, it could do 2-way or 4-way, but
-C then alignment aware code will be necessary (adding O(1) bookkeeping
-C overhead).
-C
-C We don't use r12 due to ldrd and strd limitations.
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te ldrd strd
-C v6 -
-C v6t2 -
-C v7a -
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`v0', `r3')
-
-define(`w0', `r10') define(`w1', `r11')
-define(`u0', `r8') define(`u1', `r9')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
-C Register use: u0/u1 hold source limbs, w0/w1 hold rp limbs, and the
-C pairs r4/r5 and r6/r7 alternate as high words of the products.  Each
-C umlal adds limb * v0 to hi:w with hi zeroed first, so the rp limb is
-C added inside the multiply.  The high words are then chained into the
-C next limbs with adcs.
- push { r4-r11 }
-
-C Dispatch on n mod 4.  n is biased by -3 so that the loop can test
-C its sign.
- ands r6, n, #3
- sub n, n, #3
- beq L(b00)
- cmp r6, #2
- bcc L(b01)
- beq L(b10)
-
-C n mod 4 = 3: load one limb and enter at L(mid); the pointers are
-C moved back one limb to match the offsets used there.
-L(b11): mov r6, #0
- cmn r13, #0 C carry clear
- ldr u1, [up], #-4
- ldr w1, [rp], #-4
- mov r7, #0
- b L(mid)
-
-C n mod 4 = 0: handle the first limb here, then enter at L(mid).
-L(b00): ldrd u0, u1, [up]
- ldrd w0, w1, [rp]
- mov r6, #0
- umlal w0, r6, u0, v0
- cmn r13, #0 C carry clear
- mov r7, #0
- str w0, [rp]
- b L(mid)
-
-C n mod 4 = 2: handle two limbs here, then enter the loop or finish.
-L(b10): ldrd u0, u1, [up], #8
- ldrd w0, w1, [rp]
- mov r4, #0
- umlal w0, r4, u0, v0
- cmn r13, #0 C carry clear
- mov r5, #0
- str w0, [rp], #8
- umlal w1, r5, u1, v0
- tst n, n
- bmi L(end)
- b L(top)
-
-C n mod 4 = 1: handle one limb here, then fall into the loop or finish.
-L(b01): mov r4, #0
- ldr u1, [up], #4
- ldr w1, [rp], #4
- mov r5, #0
- umlal w1, r5, u1, v0
- tst n, n
- bmi L(end)
-
- ALIGN(16)
-C Main loop, four limbs per iteration.  The carry flag is live across
-C the whole loop; the counter is updated with sub and tested with tst,
-C which leave the carry flag unchanged here.
-L(top): ldrd u0, u1, [up, #0]
- adcs r4, r4, w1
- ldrd w0, w1, [rp, #0]
- mov r6, #0
- umlal w0, r6, u0, v0 C 1 2
- adcs r5, r5, w0
- mov r7, #0
- strd r4, r5, [rp, #-4]
-L(mid): umlal w1, r7, u1, v0 C 2 3
- ldrd u0, u1, [up, #8]
- adcs r6, r6, w1
- ldrd w0, w1, [rp, #8]
- mov r4, #0
- umlal w0, r4, u0, v0 C 3 4
- adcs r7, r7, w0
- mov r5, #0
- strd r6, r7, [rp, #4]
- umlal w1, r5, u1, v0 C 0 1
- sub n, n, #4
- add up, up, #16
- add rp, rp, #16
- tst n, n
- bpl L(top)
-
-C Store the last limb; the return value is the final high word plus
-C the pending carry.
-L(end): adcs r4, r4, w1
- str r4, [rp, #-4]
- adc r0, r5, #0
- pop { r4-r11 }
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/aors_n.asm b/gmp/mpn/arm/v7a/cora15/aors_n.asm
deleted file mode 100644
index dc3f83992e..0000000000
--- a/gmp/mpn/arm/v7a/cora15/aors_n.asm
+++ /dev/null
@@ -1,162 +0,0 @@
-dnl ARM mpn_add_n/mpn_sub_n optimised for A15.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C StrongARM: -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.55 2.5
-C Cortex-A15 1.27 this
-
-C This was a major improvement compared to the code we had before, but it might
-C not be the best 8-way code possible. We've tried some permutations of auto-
-C increments and separate pointer updates, but they all ran at the same speed
-C on A15.
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te ldrd strd
-C v6 -
-C v6t2 -
-C v7a -
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-ifdef(`OPERATION_add_n', `
- define(`ADDSUBC', adcs)
- define(`IFADD', `$1')
- define(`SETCY', `cmp $1, #1')
- define(`RETVAL', `adc r0, n, #0')
- define(`RETVAL2', `adc r0, n, #1')
- define(`func', mpn_add_n)
- define(`func_nc', mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(`ADDSUBC', sbcs)
- define(`IFADD', `')
- define(`SETCY', `rsbs $1, $1, #0')
- define(`RETVAL', `sbc r0, r0, r0
- and r0, r0, #1')
- define(`RETVAL2', `RETVAL')
- define(`func', mpn_sub_n)
- define(`func_nc', mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-ASM_START()
-C Carry-in entry point: the carry argument is read from the stack into
-C r12, then the shared body is used.
-PROLOGUE(func_nc)
- ldr r12, [sp]
- b L(ent)
-EPILOGUE()
-C Plain entry point: carry-in of zero.
-PROLOGUE(func)
- mov r12, #0
-L(ent): push { r4-r9 }
-
-C Dispatch on n mod 4; n becomes the count of whole 4-limb groups.
-C Each feed-in runs SETCY on r12 to put the carry-in into the carry
-C flag, and pre-adjusts the pointers to match the offsets used at its
-C entry point.  From there on the flag is live: the counter is updated
-C with sub and tested with tst, which leave it unchanged here.
- ands r6, n, #3
- mov n, n, lsr #2
- beq L(b00)
- cmp r6, #2
- bcc L(b01)
- beq L(b10)
-
-C n mod 4 = 3: one limb here, two more pending on entry to L(lo).
-L(b11): ldr r5, [up], #4
- ldr r7, [vp], #4
- SETCY( r12)
- ADDSUBC r9, r5, r7
- ldrd r4, r5, [up, #0]
- ldrd r6, r7, [vp, #0]
- str r9, [rp], #-4
- b L(lo)
-
-C n mod 4 = 0: two limbs pending on entry to L(mid).
-L(b00): ldrd r4, r5, [up], #-8
- ldrd r6, r7, [vp], #-8
- SETCY( r12)
- sub rp, rp, #16
- b L(mid)
-
-C n mod 4 = 1: one limb here; done if there are no whole groups.
-L(b01): ldr r5, [up], #-4
- ldr r7, [vp], #-4
- SETCY( r12)
- ADDSUBC r9, r5, r7
- str r9, [rp], #-12
- tst n, n
- beq L(wd1)
-L(gt1): ldrd r4, r5, [up, #8]
- ldrd r6, r7, [vp, #8]
- b L(mid)
-
-C n mod 4 = 2: two limbs pending on entry to L(lo).
-L(b10): ldrd r4, r5, [up]
- ldrd r6, r7, [vp]
- SETCY( r12)
- sub rp, rp, #8
- b L(lo)
-
- ALIGN(16)
-C Main loop, eight limbs per full iteration, with an exit in the
-C middle.  Loads run one pair ahead of the arithmetic and stores one
-C pair behind it.
-L(top): ldrd r4, r5, [up, #8]
- ldrd r6, r7, [vp, #8]
- strd r8, r9, [rp, #8]
-L(mid): ADDSUBC r8, r4, r6
- ADDSUBC r9, r5, r7
- ldrd r4, r5, [up, #16]
- ldrd r6, r7, [vp, #16]
- strd r8, r9, [rp, #16]
- ADDSUBC r8, r4, r6
- ADDSUBC r9, r5, r7
- sub n, n, #2
- tst n, n
- bmi L(dne) C group count went negative: done
- ldrd r4, r5, [up, #24]
- ldrd r6, r7, [vp, #24]
- strd r8, r9, [rp, #24]
- ADDSUBC r8, r4, r6
- ADDSUBC r9, r5, r7
- ldrd r4, r5, [up, #32]!
- ldrd r6, r7, [vp, #32]!
- strd r8, r9, [rp, #32]!
-L(lo): ADDSUBC r8, r4, r6
- ADDSUBC r9, r5, r7
- tst n, n
- bne L(top)
-
-C Exit with n = 0: store the last pair and turn the carry flag into
-C the return value with RETVAL.
-L(end): strd r8, r9, [rp, #8]
-L(wd1): RETVAL
- pop { r4-r9 }
- bx r14
-C Exit from the middle of the loop, where n is negative: RETVAL2 is
-C used instead of RETVAL.
-C NOTE(review): the add variant of RETVAL2 presumably relies on n
-C being exactly -1 here; confirm.
-L(dne): strd r8, r9, [rp, #24]
- RETVAL2
- pop { r4-r9 }
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
deleted file mode 100644
index b9e5cd3f79..0000000000
--- a/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
+++ /dev/null
@@ -1,158 +0,0 @@
-dnl ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C StrongARM: -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 3.75 3
-C Cortex-A15 1.78 this
-
-C This code does not run as well as one could have hoped, since 1.5 c/l seems
-C realistic for this insn mix.
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te ldrd strd
-C v6 -
-C v6t2 -
-C v7a -
-
-define(`cnd',`r0')
-define(`rp', `r1')
-define(`up', `r2')
-define(`vp', `r3')
-define(`n', `r12')
-
-ifdef(`OPERATION_cnd_add_n', `
- define(`ADDSUB', adds)
- define(`ADDSUBC', adcs)
- define(`IFADD', `$1')
- define(`INITCY', `cmn r0, #0')
- define(`RETVAL', `adc r0, n, #0')
- define(`RETVAL2', `adc r0, n, #1')
- define(`func', mpn_cnd_add_n)
- define(`func_nc', mpn_add_nc)')
-ifdef(`OPERATION_cnd_sub_n', `
- define(`ADDSUB', subs)
- define(`ADDSUBC', sbcs)
- define(`IFADD', `')
- define(`INITCY', `cmp r0, #0')
- define(`RETVAL', `sbc r0, r0, r0
- and r0, r0, #1')
- define(`RETVAL2', `RETVAL')
- define(`func', mpn_cnd_sub_n)
- define(`func_nc', mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ASM_START()
-PROLOGUE(func)
-C n is the fifth argument and is read from the stack before the push
-C moves sp.
- ldr n, [sp]
- push { r4-r9 }
-
-C Turn cnd into a mask: 0xffffffff when the incoming cnd is zero,
-C 0 otherwise.  Every vp limb is then cleared with bic under this
-C mask, so a zero cnd adds or subtracts zero.  The same instructions
-C run for either value of cnd.
- cmp cnd, #1
- sbc cnd, cnd, cnd C conditionally set to 0xffffffff
-
-C Dispatch on n mod 4; n becomes the count of whole 4-limb groups.
-C Feed-ins that do no arithmetic of their own run INITCY to put the
-C carry flag in its initial state for the selected operation.  The
-C others start the carry chain with the flag-setting ADDSUB.
- ands r6, n, #3
- mov n, n, lsr #2
- beq L(b00)
- cmp r6, #2
- bcc L(b01)
- beq L(b10)
-
-C n mod 4 = 3: one limb here, two more pending on entry to L(lo).
-L(b11): ldr r5, [up], #4
- ldr r7, [vp], #4
- bic r7, r7, cnd
- ADDSUB r9, r5, r7
- ldrd r4, r5, [up, #0]
- ldrd r6, r7, [vp, #0]
- bic r6, r6, cnd
- bic r7, r7, cnd
- str r9, [rp], #-4
- b L(lo)
-
-C n mod 4 = 0: two limbs pending on entry to L(mid).
-L(b00): ldrd r4, r5, [up], #-8
- ldrd r6, r7, [vp], #-8
- bic r6, r6, cnd
- bic r7, r7, cnd
- INITCY
- sub rp, rp, #16
- b L(mid)
-
-C n mod 4 = 1: one limb here; done if there are no whole groups.
-L(b01): ldr r5, [up], #-4
- ldr r7, [vp], #-4
- bic r7, r7, cnd
- ADDSUB r9, r5, r7
- str r9, [rp], #-12
- tst n, n
- beq L(wd1)
-L(gt1): ldrd r4, r5, [up, #8]
- ldrd r6, r7, [vp, #8]
- bic r6, r6, cnd
- bic r7, r7, cnd
- b L(mid)
-
-C n mod 4 = 2: two limbs pending on entry to L(lo).
-L(b10): ldrd r4, r5, [up]
- ldrd r6, r7, [vp]
- bic r6, r6, cnd
- bic r7, r7, cnd
- INITCY
- sub rp, rp, #8
- b L(lo)
-
- ALIGN(16)
-C Main loop, four limbs per iteration.  The carry flag is live across
-C the loop; the counter is updated with sub and tested with tst, which
-C leave it unchanged here.
-L(top): ldrd r6, r7, [vp, #8]
- ldrd r4, r5, [up, #8]
- bic r6, r6, cnd
- bic r7, r7, cnd
- strd r8, r9, [rp, #8]
-L(mid): ADDSUBC r8, r4, r6
- ADDSUBC r9, r5, r7
- ldrd r6, r7, [vp, #16]!
- ldrd r4, r5, [up, #16]!
- bic r6, r6, cnd
- bic r7, r7, cnd
- sub n, n, #1
- strd r8, r9, [rp, #16]!
-L(lo): ADDSUBC r8, r4, r6
- ADDSUBC r9, r5, r7
- tst n, n
- bne L(top)
-
-C Store the last pair and turn the carry flag into the return value
-C with RETVAL.  n is zero on both paths that reach L(wd1).
-L(end): strd r8, r9, [rp, #8]
-L(wd1): RETVAL
- pop { r4-r9 }
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/com.asm b/gmp/mpn/arm/v7a/cora15/com.asm
deleted file mode 100644
index a258afe934..0000000000
--- a/gmp/mpn/arm/v7a/cora15/com.asm
+++ /dev/null
@@ -1,180 +0,0 @@
-dnl ARM mpn_com optimised for A15.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 2.5
-C Cortex-A15 1.0
-
-C This is great A15 core register code, but it is a bit large.
-C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te ldrd strd
-C v6 -
-C v6t2 -
-C v7a -
-
-define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
-define(`UNROLL', 4x2) C alternatives: 4 4x2
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-
-ASM_START()
-PROLOGUE(mpn_com)
- push { r4-r5,r8-r9 }
-
-ifelse(FEEDIN_VARIANT,0,`
- ands r12, n, #3
- mov n, n, lsr #2
- beq L(b00a)
- tst r12, #1
- beq L(bx0)
- ldr r5, [up], #4
- mvn r9, r5
- str r9, [rp], #4
- tst r12, #2
- beq L(b00)
-L(bx0): ldrd r4, r5, [up, #0]
- sub rp, rp, #8
- b L(lo)
-L(b00): tst n, n
- beq L(wd1)
-L(b00a):ldrd r4, r5, [up], #-8
- sub rp, rp, #16
- b L(mid)
-')
-ifelse(FEEDIN_VARIANT,1,`
- and r12, n, #3
- mov n, n, lsr #2
- tst r12, #1
- beq L(bx0)
- ldr r5, [up], #4
- mvn r9, r5
- str r9, [rp], #4
-L(bx0): tst r12, #2
- beq L(b00)
- ldrd r4, r5, [up, #0]
- sub rp, rp, #8
- b L(lo)
-L(b00): tst n, n
- beq L(wd1)
- ldrd r4, r5, [up], #-8
- sub rp, rp, #16
- b L(mid)
-')
-ifelse(FEEDIN_VARIANT,2,`
- ands r12, n, #3
- mov n, n, lsr #2
- beq L(b00)
- cmp r12, #2
- bcc L(b01)
- beq L(b10)
-
-L(b11): ldr r5, [up], #4
- mvn r9, r5
- ldrd r4, r5, [up, #0]
- str r9, [rp], #-4
- b L(lo)
-
-L(b00): ldrd r4, r5, [up], #-8
- sub rp, rp, #16
- b L(mid)
-
-L(b01): ldr r5, [up], #-4
- mvn r9, r5
- str r9, [rp], #-12
- tst n, n
- beq L(wd1)
-L(gt1): ldrd r4, r5, [up, #8]
- b L(mid)
-
-L(b10): ldrd r4, r5, [up]
- sub rp, rp, #8
- b L(lo)
-')
- ALIGN(16)
-ifelse(UNROLL,4,`
-L(top): ldrd r4, r5, [up, #8]
- strd r8, r9, [rp, #8]
-L(mid): mvn r8, r4
- mvn r9, r5
- ldrd r4, r5, [up, #16]!
- strd r8, r9, [rp, #16]!
- sub n, n, #1
-L(lo): mvn r8, r4
- mvn r9, r5
- tst n, n
- bne L(top)
-')
-ifelse(UNROLL,4x2,`
-L(top): ldrd r4, r5, [up, #8]
- strd r8, r9, [rp, #8]
-L(mid): mvn r8, r4
- mvn r9, r5
- ldrd r4, r5, [up, #16]
- strd r8, r9, [rp, #16]
- mvn r8, r4
- mvn r9, r5
- sub n, n, #2
- tst n, n
- bmi L(dne)
- ldrd r4, r5, [up, #24]
- strd r8, r9, [rp, #24]
- mvn r8, r4
- mvn r9, r5
- ldrd r4, r5, [up, #32]!
- strd r8, r9, [rp, #32]!
-L(lo): mvn r8, r4
- mvn r9, r5
- tst n, n
- bne L(top)
-')
-
-L(end): strd r8, r9, [rp, #8]
-L(wd1): pop { r4-r5,r8-r9 }
- bx r14
-ifelse(UNROLL,4x2,`
-L(dne): strd r8, r9, [rp, #24]
- pop { r4-r5,r8-r9 }
- bx r14
-')
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
deleted file mode 100644
index 2a06532b3e..0000000000
--- a/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 1700MHz Cortex-A15 with Neon (in spite of file position) */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 15
-
-#define MUL_TOOM22_THRESHOLD 23
-#define MUL_TOOM33_THRESHOLD 90
-#define MUL_TOOM44_THRESHOLD 262
-#define MUL_TOOM6H_THRESHOLD 351
-#define MUL_TOOM8H_THRESHOLD 557
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 90
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 169
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 43
-#define SQR_TOOM3_THRESHOLD 138
-#define SQR_TOOM4_THRESHOLD 363
-#define SQR_TOOM6_THRESHOLD 517
-#define SQR_TOOM8_THRESHOLD 725
-
-#define MULMID_TOOM42_THRESHOLD 52
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 23
-
-#define MUL_FFT_MODF_THRESHOLD 550 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 550, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 19, 6}, { 39, 7}, { 25, 6}, \
- { 51, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 51, 8}, \
- { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 99, 9}, { 55,10}, { 31, 9}, \
- { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \
- { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
- { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \
- { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \
- { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
- { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
- { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \
- { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
- { 607,11}, { 319,10}, { 671,11}, { 351,12}, \
- { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
- { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
- { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
- { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 831,12}, { 447,11}, { 895,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 575,11}, \
- { 1151,12}, { 639,11}, { 1279,12}, { 703,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1663,12}, { 895,13}, { 511,12}, { 1087,13}, \
- { 639,12}, { 1407,13}, { 767,12}, { 1599,13}, \
- { 895,14}, { 511,13}, { 1023,12}, { 2111,13}, \
- { 1151,12}, { 2431,13}, { 1279,14}, { 767,13}, \
- { 1535,12}, { 3071,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
- { 2815,12}, { 5631,13}, { 2943,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 137
-#define MUL_FFT_THRESHOLD 5760
-
-#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 525, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \
- { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 51, 8}, { 27, 7}, { 55, 9}, \
- { 15, 8}, { 31, 7}, { 63, 8}, { 39, 9}, \
- { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
- { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 143, 9}, \
- { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
- { 159,10}, { 335, 9}, { 671,10}, { 351,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
- { 799,10}, { 415,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 671,11}, { 351,12}, { 191,11}, \
- { 383,10}, { 799,11}, { 415,10}, { 831,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 735,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 831,12}, { 447,11}, { 895,12}, { 511,11}, \
- { 1023,12}, { 575,11}, { 1151,12}, { 639,11}, \
- { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1663,12}, { 895,13}, \
- { 511,12}, { 1087,13}, { 639,12}, { 1407,13}, \
- { 767,12}, { 1727,13}, { 895,14}, { 511,13}, \
- { 1023,12}, { 2047,13}, { 1151,12}, { 2431,13}, \
- { 1279,14}, { 767,13}, { 1535,12}, { 3071,15}, \
- { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \
- { 2175,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
- { 2687,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 139
-#define SQR_FFT_THRESHOLD 4736
-
-#define MULLO_BASECASE_THRESHOLD 9
-#define MULLO_DC_THRESHOLD 39
-#define MULLO_MUL_N_THRESHOLD 11278
-
-#define DC_DIV_QR_THRESHOLD 54
-#define DC_DIVAPPR_Q_THRESHOLD 296
-#define DC_BDIV_QR_THRESHOLD 52
-#define DC_BDIV_Q_THRESHOLD 300
-
-#define INV_MULMOD_BNM1_THRESHOLD 44
-#define INV_NEWTON_THRESHOLD 294
-#define INV_APPR_THRESHOLD 294
-
-#define BINV_NEWTON_THRESHOLD 375
-#define REDC_1_TO_REDC_2_THRESHOLD 102
-#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
-
-#define MU_DIV_QR_THRESHOLD 1718
-#define MU_DIVAPPR_Q_THRESHOLD 1718
-#define MUPI_DIV_QR_THRESHOLD 108
-#define MU_BDIV_QR_THRESHOLD 1528
-#define MU_BDIV_Q_THRESHOLD 1718
-
-#define POWM_SEC_TABLE 3,32,70,416,1464
-
-#define MATRIX22_STRASSEN_THRESHOLD 22
-#define HGCD_THRESHOLD 152
-#define HGCD_APPR_THRESHOLD 230
-#define HGCD_REDUCE_THRESHOLD 3259
-#define GCD_DC_THRESHOLD 702
-#define GCDEXT_DC_THRESHOLD 538
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 18
-#define GET_STR_PRECOMPUTE_THRESHOLD 32
-#define SET_STR_DC_THRESHOLD 119
-#define SET_STR_PRECOMPUTE_THRESHOLD 1063
-
-#define FAC_DSC_THRESHOLD 262
-#define FAC_ODD_THRESHOLD 26
diff --git a/gmp/mpn/arm/v7a/cora15/logops_n.asm b/gmp/mpn/arm/v7a/cora15/logops_n.asm
deleted file mode 100644
index 06026143e1..0000000000
--- a/gmp/mpn/arm/v7a/cora15/logops_n.asm
+++ /dev/null
@@ -1,253 +0,0 @@
-dnl ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc, optimised for A15.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb
-C and andn ior xor nand iorn nior xnor
-C StrongARM ? ?
-C XScale ? ?
-C Cortex-A7 ? ?
-C Cortex-A8 ? ?
-C Cortex-A9 3.5 3.56
-C Cortex-A15 1.27 1.64
-
-C This is great A15 core register code, but it is a bit large.
-C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te ldrd strd
-C v6 -
-C v6t2 -
-C v7a -
-
-define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
-define(`UNROLL', 4x2) C alternatives: 4 4x2
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-define(`POSTOP')
-
-ifdef(`OPERATION_and_n',`
- define(`func', `mpn_and_n')
- define(`LOGOP', `and $1, $2, $3')')
-ifdef(`OPERATION_andn_n',`
- define(`func', `mpn_andn_n')
- define(`LOGOP', `bic $1, $2, $3')')
-ifdef(`OPERATION_nand_n',`
- define(`func', `mpn_nand_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `and $1, $2, $3')')
-ifdef(`OPERATION_ior_n',`
- define(`func', `mpn_ior_n')
- define(`LOGOP', `orr $1, $2, $3')')
-ifdef(`OPERATION_iorn_n',`
- define(`func', `mpn_iorn_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `bic $1, $3, $2')')
-ifdef(`OPERATION_nior_n',`
- define(`func', `mpn_nior_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `orr $1, $2, $3')')
-ifdef(`OPERATION_xor_n',`
- define(`func', `mpn_xor_n')
- define(`LOGOP', `eor $1, $2, $3')')
-ifdef(`OPERATION_xnor_n',`
- define(`func', `mpn_xnor_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `eor $1, $2, $3')')
-
-MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-
-ASM_START()
-PROLOGUE(func)
- push { r4-r9 }
-
-ifelse(FEEDIN_VARIANT,0,`
- ands r6, n, #3
- mov n, n, lsr #2
- beq L(b00a)
- tst r6, #1
- beq L(bx0)
- ldr r5, [up], #4
- ldr r7, [vp], #4
- LOGOP( r9, r5, r7)
- POSTOP( r9)
- str r9, [rp], #4
- tst r6, #2
- beq L(b00)
-L(bx0): ldrd r4, r5, [up, #0]
- ldrd r6, r7, [vp, #0]
- sub rp, rp, #8
- b L(lo)
-L(b00): tst n, n
- beq L(wd1)
-L(b00a):ldrd r4, r5, [up], #-8
- ldrd r6, r7, [vp], #-8
- sub rp, rp, #16
- b L(mid)
-')
-ifelse(FEEDIN_VARIANT,1,`
- and r6, n, #3
- mov n, n, lsr #2
- tst r6, #1
- beq L(bx0)
- ldr r5, [up], #4
- ldr r7, [vp], #4
- LOGOP( r9, r5, r7)
- POSTOP( r9)
- str r9, [rp], #4
-L(bx0): tst r6, #2
- beq L(b00)
- ldrd r4, r5, [up, #0]
- ldrd r6, r7, [vp, #0]
- sub rp, rp, #8
- b L(lo)
-L(b00): tst n, n
- beq L(wd1)
- ldrd r4, r5, [up], #-8
- ldrd r6, r7, [vp], #-8
- sub rp, rp, #16
- b L(mid)
-')
-ifelse(FEEDIN_VARIANT,2,`
- ands r6, n, #3
- mov n, n, lsr #2
- beq L(b00)
- cmp r6, #2
- bcc L(b01)
- beq L(b10)
-
-L(b11): ldr r5, [up], #4
- ldr r7, [vp], #4
- LOGOP( r9, r5, r7)
- ldrd r4, r5, [up, #0]
- ldrd r6, r7, [vp, #0]
- POSTOP( r9)
- str r9, [rp], #-4
- b L(lo)
-
-L(b00): ldrd r4, r5, [up], #-8
- ldrd r6, r7, [vp], #-8
- sub rp, rp, #16
- b L(mid)
-
-L(b01): ldr r5, [up], #-4
- ldr r7, [vp], #-4
- LOGOP( r9, r5, r7)
- POSTOP( r9)
- str r9, [rp], #-12
- tst n, n
- beq L(wd1)
-L(gt1): ldrd r4, r5, [up, #8]
- ldrd r6, r7, [vp, #8]
- b L(mid)
-
-L(b10): ldrd r4, r5, [up]
- ldrd r6, r7, [vp]
- sub rp, rp, #8
- b L(lo)
-')
- ALIGN(16)
-ifelse(UNROLL,4,`
-L(top): ldrd r4, r5, [up, #8]
- ldrd r6, r7, [vp, #8]
- POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #8]
-L(mid): LOGOP( r8, r4, r6)
- LOGOP( r9, r5, r7)
- ldrd r4, r5, [up, #16]!
- ldrd r6, r7, [vp, #16]!
- POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #16]!
- sub n, n, #1
-L(lo): LOGOP( r8, r4, r6)
- LOGOP( r9, r5, r7)
- tst n, n
- bne L(top)
-')
-ifelse(UNROLL,4x2,`
-L(top): ldrd r4, r5, [up, #8]
- ldrd r6, r7, [vp, #8]
- POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #8]
-L(mid): LOGOP( r8, r4, r6)
- LOGOP( r9, r5, r7)
- ldrd r4, r5, [up, #16]
- ldrd r6, r7, [vp, #16]
- POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #16]
- LOGOP( r8, r4, r6)
- LOGOP( r9, r5, r7)
- sub n, n, #2
- tst n, n
- bmi L(dne)
- ldrd r4, r5, [up, #24]
- ldrd r6, r7, [vp, #24]
- POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #24]
- LOGOP( r8, r4, r6)
- LOGOP( r9, r5, r7)
- ldrd r4, r5, [up, #32]!
- ldrd r6, r7, [vp, #32]!
- POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #32]!
-L(lo): LOGOP( r8, r4, r6)
- LOGOP( r9, r5, r7)
- tst n, n
- bne L(top)
-')
-
-L(end): POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #8]
-L(wd1): pop { r4-r9 }
- bx r14
-ifelse(UNROLL,4x2,`
-L(dne): POSTOP( r8)
- POSTOP( r9)
- strd r8, r9, [rp, #24]
- pop { r4-r9 }
- bx r14
-')
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/mul_1.asm b/gmp/mpn/arm/v7a/cora15/mul_1.asm
deleted file mode 100644
index 766ba5c57f..0000000000
--- a/gmp/mpn/arm/v7a/cora15/mul_1.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-dnl ARM mpn_mul_1 optimised for A15.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C StrongARM: -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 5.25 3.25
-C Cortex-A15 2.25 this
-
-
-C This runs well on A15 but very poorly on A9. By scheduling loads and adds
-C it is possible to get good A9 performance as well, but at the cost of using
-C many more (callee-saves) registers.
-
-C This is armv5 code, optimized for the armv7a cpu A15. Its location in the
-C GMP file structure might be misleading.
-
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`v0', `r3')
-
-ASM_START()
-PROLOGUE(mpn_mul_1c)
- ldr r12, [sp]
- b L(ent)
-EPILOGUE()
-PROLOGUE(mpn_mul_1)
- mov r12, #0
-L(ent): push {r4-r7}
-
- ldr r6, [up], #4
- tst n, #1
- beq L(bx0)
-
-L(bx1): umull r4, r7, r6, v0
- adds r4, r4, r12
- tst n, #2
- beq L(lo1)
- b L(lo3)
-
-L(bx0): umull r4, r5, r6, v0
- adds r4, r4, r12
- tst n, #2
- beq L(lo0)
- b L(lo2)
-
-L(top): ldr r6, [up], #4
- str r4, [rp], #4
- umull r4, r5, r6, v0
- adds r4, r4, r7
-L(lo0): ldr r6, [up], #4
- str r4, [rp], #4
- umull r4, r7, r6, v0
- adcs r4, r4, r5
-L(lo3): ldr r6, [up], #4
- str r4, [rp], #4
- umull r4, r5, r6, v0
- adcs r4, r4, r7
-L(lo2): ldr r6, [up], #4
- str r4, [rp], #4
- umull r4, r7, r6, v0
- adcs r4, r4, r5
-L(lo1): adc r7, r7, #0
- subs n, n, #4
- bgt L(top)
-
- str r4, [rp]
- mov r0, r7
- pop {r4-r7}
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
deleted file mode 100644
index d8cfe3f78f..0000000000
--- a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-
-ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
-
-include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
deleted file mode 100644
index b48204d926..0000000000
--- a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-
-ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
-
-include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
deleted file mode 100644
index 16c34a2699..0000000000
--- a/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-C cycles/limb
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 5.25
-C Cortex-A15 2.25
-
-C TODO
-C * Consider using 4-way feed-in code.
-C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
-C insufficiently for A7 and A8.
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-ifdef(`DO_add', `
- define(`ADCSBCS', `adcs $1, $2, $3')
- define(`CLRCY', `cmn r13, #1')
- define(`RETVAL', `adc r0, $1, #0')
- define(`func', mpn_addlsh`'LSH`'_n)')
-ifdef(`DO_sub', `
- define(`ADCSBCS', `sbcs $1, $2, $3')
- define(`CLRCY', `cmp r13, #0')
- define(`RETVAL', `sbc $2, $2, $2
- cmn $2, #1
- adc r0, $1, #0')
- define(`func', mpn_sublsh`'LSH`'_n)')
-ifdef(`DO_rsb', `
- define(`ADCSBCS', `sbcs $1, $3, $2')
- define(`CLRCY', `cmp r13, #0')
- define(`RETVAL', `sbc r0, $1, #0')
- define(`func', mpn_rsblsh`'LSH`'_n)')
-
-
-ASM_START()
-PROLOGUE(func)
- push {r4-r10}
- vmov.i8 d0, #0 C could feed carry through here
- CLRCY
- tst n, #1
- beq L(bb0)
-
-L(bb1): vld1.32 {d3[0]}, [vp]!
- vsli.u32 d0, d3, #LSH
- ldr r12, [up], #4
- vmov.32 r5, d0[0]
- vshr.u32 d0, d3, #32-LSH
- ADCSBCS( r12, r12, r5)
- str r12, [rp], #4
- bics n, n, #1
- beq L(rtn)
-
-L(bb0): tst n, #2
- beq L(b00)
-
-L(b10): vld1.32 {d3}, [vp]!
- vsli.u64 d0, d3, #LSH
- ldmia up!, {r10,r12}
- vmov r4, r5, d0
- vshr.u64 d0, d3, #64-LSH
- ADCSBCS( r10, r10, r4)
- ADCSBCS( r12, r12, r5)
- stmia rp!, {r10,r12}
- bics n, n, #2
- beq L(rtn)
-
-L(b00): vld1.32 {d2}, [vp]!
- vsli.u64 d0, d2, #LSH
- vshr.u64 d1, d2, #64-LSH
- vld1.32 {d3}, [vp]!
- vsli.u64 d1, d3, #LSH
- vmov r6, r7, d0
- vshr.u64 d0, d3, #64-LSH
- sub n, n, #4
- tst n, n
- beq L(end)
-
- ALIGN(16)
-L(top): ldmia up!, {r8,r9,r10,r12}
- vld1.32 {d2}, [vp]!
- vsli.u64 d0, d2, #LSH
- vmov r4, r5, d1
- vshr.u64 d1, d2, #64-LSH
- ADCSBCS( r8, r8, r6)
- ADCSBCS( r9, r9, r7)
- vld1.32 {d3}, [vp]!
- vsli.u64 d1, d3, #LSH
- vmov r6, r7, d0
- vshr.u64 d0, d3, #64-LSH
- ADCSBCS( r10, r10, r4)
- ADCSBCS( r12, r12, r5)
- stmia rp!, {r8,r9,r10,r12}
- sub n, n, #4
- tst n, n
- bne L(top)
-
-L(end): ldmia up!, {r8,r9,r10,r12}
- vmov r4, r5, d1
- ADCSBCS( r8, r8, r6)
- ADCSBCS( r9, r9, r7)
- ADCSBCS( r10, r10, r4)
- ADCSBCS( r12, r12, r5)
- stmia rp!, {r8,r9,r10,r12}
-L(rtn): vmov.32 r0, d0[0]
- RETVAL( r0, r1)
- pop {r4-r10}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/com.asm b/gmp/mpn/arm/v7a/cora15/neon/com.asm
deleted file mode 100644
index 9e7a629287..0000000000
--- a/gmp/mpn/arm/v7a/cora15/neon/com.asm
+++ /dev/null
@@ -1,97 +0,0 @@
-dnl ARM Neon mpn_com optimised for A15.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM ?
-C XScale ?
-C Cortex-A8 ?
-C Cortex-A9 2.1
-C Cortex-A15 0.65
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-
-ASM_START()
-PROLOGUE(mpn_com)
- cmp n, #7
- ble L(bc)
-
-C Perform a few initial operation until rp is 128-bit aligned
- tst rp, #4
- beq L(al1)
- vld1.32 {d0[0]}, [up]!
- sub n, n, #1
- vmvn d0, d0
- vst1.32 {d0[0]}, [rp]!
-L(al1): tst rp, #8
- beq L(al2)
- vld1.32 {d0}, [up]!
- sub n, n, #2
- vmvn d0, d0
- vst1.32 {d0}, [rp:64]!
-L(al2): vld1.32 {q2}, [up]!
- subs n, n, #12
- blt L(end)
-
- ALIGN(16)
-L(top): vld1.32 {q0}, [up]!
- vmvn q2, q2
- subs n, n, #8
- vst1.32 {q2}, [rp:128]!
- vld1.32 {q2}, [up]!
- vmvn q0, q0
- vst1.32 {q0}, [rp:128]!
- bge L(top)
-
-L(end): vmvn q2, q2
- vst1.32 {q2}, [rp:128]!
-
-C Handle last 0-7 limbs. Note that rp is aligned after loop, but not when we
-C arrive here via L(bc)
-L(bc): tst n, #4
- beq L(tl1)
- vld1.32 {q0}, [up]!
- vmvn q0, q0
- vst1.32 {q0}, [rp]!
-L(tl1): tst n, #2
- beq L(tl2)
- vld1.32 {d0}, [up]!
- vmvn d0, d0
- vst1.32 {d0}, [rp]!
-L(tl2): tst n, #1
- beq L(tl3)
- vld1.32 {d0[0]}, [up]
- vmvn d0, d0
- vst1.32 {d0[0]}, [rp]
-L(tl3): bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
deleted file mode 100644
index 98fe535def..0000000000
--- a/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
+++ /dev/null
@@ -1,110 +0,0 @@
-dnl ARM Neon mpn_copyd optimised for A15.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.75 slower than core register code
-C Cortex-A15 0.52
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-
-ASM_START()
-PROLOGUE(mpn_copyd)
- add rp, rp, n, lsl #2
- add up, up, n, lsl #2
-
- cmp n, #7
- ble L(bc)
-
-C Copy until rp is 128-bit aligned
- tst rp, #4
- beq L(al1)
- sub up, up, #4
- vld1.32 {d22[0]}, [up]
- sub n, n, #1
- sub rp, rp, #4
- vst1.32 {d22[0]}, [rp]
-L(al1): tst rp, #8
- beq L(al2)
- sub up, up, #8
- vld1.32 {d22}, [up]
- sub n, n, #2
- sub rp, rp, #8
- vst1.32 {d22}, [rp:64]
-L(al2): sub up, up, #16
- vld1.32 {d26-d27}, [up]
- subs n, n, #12
- sub rp, rp, #16 C offset rp for loop
- blt L(end)
-
- sub up, up, #16 C offset up for loop
- mov r12, #-16
-
- ALIGN(16)
-L(top): vld1.32 {d22-d23}, [up], r12
- vst1.32 {d26-d27}, [rp:128], r12
- vld1.32 {d26-d27}, [up], r12
- vst1.32 {d22-d23}, [rp:128], r12
- subs n, n, #8
- bge L(top)
-
- add up, up, #16 C undo up offset
- C rp offset undoing folded
-L(end): vst1.32 {d26-d27}, [rp:128]
-
-C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we
-C arrive here via L(bc)
-L(bc): tst n, #4
- beq L(tl1)
- sub up, up, #16
- vld1.32 {d22-d23}, [up]
- sub rp, rp, #16
- vst1.32 {d22-d23}, [rp]
-L(tl1): tst n, #2
- beq L(tl2)
- sub up, up, #8
- vld1.32 {d22}, [up]
- sub rp, rp, #8
- vst1.32 {d22}, [rp]
-L(tl2): tst n, #1
- beq L(tl3)
- sub up, up, #4
- vld1.32 {d22[0]}, [up]
- sub rp, rp, #4
- vst1.32 {d22[0]}, [rp]
-L(tl3): bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
deleted file mode 100644
index 2e05afe5e8..0000000000
--- a/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
+++ /dev/null
@@ -1,90 +0,0 @@
-dnl ARM Neon mpn_copyi optimised for A15.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 1.75 slower than core register code
-C Cortex-A15 0.52
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-
-ASM_START()
-PROLOGUE(mpn_copyi)
- cmp n, #7
- ble L(bc)
-
-C Copy until rp is 128-bit aligned
- tst rp, #4
- beq L(al1)
- vld1.32 {d22[0]}, [up]!
- sub n, n, #1
- vst1.32 {d22[0]}, [rp]!
-L(al1): tst rp, #8
- beq L(al2)
- vld1.32 {d22}, [up]!
- sub n, n, #2
- vst1.32 {d22}, [rp:64]!
-L(al2): vld1.32 {d26-d27}, [up]!
- subs n, n, #12
- blt L(end)
-
- ALIGN(16)
-L(top): vld1.32 {d22-d23}, [up]!
- vst1.32 {d26-d27}, [rp:128]!
- vld1.32 {d26-d27}, [up]!
- vst1.32 {d22-d23}, [rp:128]!
- subs n, n, #8
- bge L(top)
-
-L(end): vst1.32 {d26-d27}, [rp:128]!
-
-C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we
-C arrive here via L(bc)
-L(bc): tst n, #4
- beq L(tl1)
- vld1.32 {d22-d23}, [up]!
- vst1.32 {d22-d23}, [rp]!
-L(tl1): tst n, #2
- beq L(tl2)
- vld1.32 {d22}, [up]!
- vst1.32 {d22}, [rp]!
-L(tl2): tst n, #1
- beq L(tl3)
- vld1.32 {d22[0]}, [up]
- vst1.32 {d22[0]}, [rp]
-L(tl3): bx lr
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
deleted file mode 100644
index 2c11d6debd..0000000000
--- a/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
+++ /dev/null
@@ -1,177 +0,0 @@
-dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C StrongARM -
-C XScale -
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 4-5
-C Cortex-A15 2.5
-
-C TODO
-C * Try to make this smaller, its size (384 bytes) is excessive.
-C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
-C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
-C insufficiently for A7 and A8.
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`vp', `r2')
-define(`n', `r3')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(`ADDSUBS', `adds $1, $2, $3')
- define(`ADCSBCS', `adcs $1, $2, $3')
- define(`IFADD', `$1')
- define(`IFSUB', `')
- define(`func', mpn_rsh1add_n)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(`ADDSUBS', `subs $1, $2, $3')
- define(`ADCSBCS', `sbcs $1, $2, $3')
- define(`IFADD', `')
- define(`IFSUB', `$1')
- define(`func', mpn_rsh1sub_n)')
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
-
-ASM_START()
-PROLOGUE(func)
- push {r4-r10}
-
- ands r4, n, #3
- beq L(b00)
- cmp r4, #2
- blo L(b01)
- beq L(b10)
-
-L(b11): ldmia up!, {r9,r10,r12}
- ldmia vp!, {r5,r6,r7}
- ADDSUBS( r9, r9, r5)
- vmov d4, r9, r9
- ADCSBCS( r10, r10, r6)
- ADCSBCS( r12, r12, r7)
- vshr.u64 d3, d4, #1
- vmov d1, r10, r12
- vsli.u64 d3, d1, #31
- vshr.u64 d2, d1, #1
- vst1.32 d3[0], [rp]!
- bics n, n, #3
- beq L(wd2)
-L(gt3): ldmia up!, {r8,r9,r10,r12}
- ldmia vp!, {r4,r5,r6,r7}
- b L(mi0)
-
-L(b10): ldmia up!, {r10,r12}
- ldmia vp!, {r6,r7}
- ADDSUBS( r10, r10, r6)
- ADCSBCS( r12, r12, r7)
- vmov d4, r10, r12
- bics n, n, #2
- vshr.u64 d2, d4, #1
- beq L(wd2)
-L(gt2): ldmia up!, {r8,r9,r10,r12}
- ldmia vp!, {r4,r5,r6,r7}
- b L(mi0)
-
-L(b01): ldr r12, [up], #4
- ldr r7, [vp], #4
- ADDSUBS( r12, r12, r7)
- vmov d4, r12, r12
- bics n, n, #1
- bne L(gt1)
- mov r5, r12, lsr #1
-IFADD(` adc r1, n, #0')
-IFSUB(` adc r1, n, #1')
- bfi r5, r1, #31, #1
- str r5, [rp]
- and r0, r12, #1
- pop {r4-r10}
- bx r14
-L(gt1): ldmia up!, {r8,r9,r10,r12}
- ldmia vp!, {r4,r5,r6,r7}
- vshr.u64 d2, d4, #1
- ADCSBCS( r8, r8, r4)
- ADCSBCS( r9, r9, r5)
- vmov d0, r8, r9
- ADCSBCS( r10, r10, r6)
- ADCSBCS( r12, r12, r7)
- vsli.u64 d2, d0, #31
- vshr.u64 d3, d0, #1
- vst1.32 d2[0], [rp]!
- b L(mi1)
-
-L(b00): ldmia up!, {r8,r9,r10,r12}
- ldmia vp!, {r4,r5,r6,r7}
- ADDSUBS( r8, r8, r4)
- ADCSBCS( r9, r9, r5)
- vmov d4, r8, r9
- ADCSBCS( r10, r10, r6)
- ADCSBCS( r12, r12, r7)
- vshr.u64 d3, d4, #1
- b L(mi1)
-
- ALIGN(16)
-L(top): ldmia up!, {r8,r9,r10,r12}
- ldmia vp!, {r4,r5,r6,r7}
- vsli.u64 d3, d1, #63
- vshr.u64 d2, d1, #1
- vst1.32 d3, [rp]!
-L(mi0): ADCSBCS( r8, r8, r4)
- ADCSBCS( r9, r9, r5)
- vmov d0, r8, r9
- ADCSBCS( r10, r10, r6)
- ADCSBCS( r12, r12, r7)
- vsli.u64 d2, d0, #63
- vshr.u64 d3, d0, #1
- vst1.32 d2, [rp]!
-L(mi1): vmov d1, r10, r12
- sub n, n, #4
- tst n, n
- bne L(top)
-
-L(end): vsli.u64 d3, d1, #63
- vshr.u64 d2, d1, #1
- vst1.32 d3, [rp]!
-L(wd2): vmov r4, r5, d2
-IFADD(` adc r1, n, #0')
-IFSUB(` adc r1, n, #1')
- bfi r5, r1, #31, #1
- stm rp, {r4,r5}
-
-L(rtn): vmov.32 r0, d4[0]
- and r0, r0, #1
- pop {r4-r10}
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora15/submul_1.asm b/gmp/mpn/arm/v7a/cora15/submul_1.asm
deleted file mode 100644
index ed7bfe820b..0000000000
--- a/gmp/mpn/arm/v7a/cora15/submul_1.asm
+++ /dev/null
@@ -1,159 +0,0 @@
-dnl ARM mpn_submul_1 optimised for A15.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C StrongARM: -
-C XScale ?
-C Cortex-A7 ?
-C Cortex-A8 ?
-C Cortex-A9 5.75 3.75
-C Cortex-A15 2.32 this
-
-C This code uses umlal and umaal for adding in the rp[] data, keeping the
-C recurrency path separate from any multiply instructions. It performs well on
-C A15, but not quite at the multiply bandwidth like the corresponding addmul_1
-C code.
-C
-C We don't use r12 due to ldrd and strd limitations.
-C
-C This loop complements U on the fly,
-C U' = B^n - 1 - U
-C and then uses that
-C R - U*v = R + U'*v + v - B^n v
-
-C Architecture requirements:
-C v5 -
-C v5t -
-C v5te ldrd strd
-C v6 umaal
-C v6t2 -
-C v7a -
-
-define(`rp', `r0')
-define(`up', `r1')
-define(`n', `r2')
-define(`v0', `r3')
-
-define(`w0', `r10') define(`w1', `r11')
-define(`u0', `r8') define(`u1', `r9')
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- sub sp, sp, #32
- strd r10, r11, [sp, #24]
- strd r8, r9, [sp, #16]
- strd r6, r7, [sp, #8]
- strd r4, r5, [sp, #0]
-C push { r4-r11 }
-
- ands r6, n, #3
- sub n, n, #3
- beq L(b00)
- cmp r6, #2
- bcc L(b01)
- beq L(b10)
-
-L(b11): mov r6, #0
- ldr u1, [up], #-4
- ldr w1, [rp], #-16
- mvn u1, u1
- adds r7, v0, #0
- b L(mid)
-
-L(b00): ldrd u0, u1, [up]
- ldrd w0, w1, [rp], #-12
- mvn u0, u0
- mvn u1, u1
- mov r6, v0
- umaal w0, r6, u0, v0
- cmn r13, #0 C carry clear
- mov r7, #0
- str w0, [rp, #12]
- b L(mid)
-
-L(b10): ldrd u0, u1, [up], #8
- ldrd w0, w1, [rp]
- mvn u0, u0
- mvn u1, u1
- mov r4, v0
- umaal w0, r4, u0, v0
- mov r5, #0
- str w0, [rp], #-4
- umlal w1, r5, u1, v0
- adds n, n, #0
- bmi L(end)
- b L(top)
-
-L(b01): ldr u1, [up], #4
- ldr w1, [rp], #-8
- mvn u1, u1
- mov r5, v0
- mov r4, #0
- umaal w1, r5, u1, v0
- tst n, n
- bmi L(end)
-
-C ALIGN(16)
-L(top): ldrd u0, u1, [up, #0]
- adcs r4, r4, w1
- mvn u0, u0
- ldrd w0, w1, [rp, #12]
- mvn u1, u1
- mov r6, #0
- umlal w0, r6, u0, v0 C 1 2
- adcs r5, r5, w0
- mov r7, #0
- strd r4, r5, [rp, #8]
-L(mid): umaal w1, r7, u1, v0 C 2 3
- ldrd u0, u1, [up, #8]
- add up, up, #16
- adcs r6, r6, w1
- mvn u0, u0
- ldrd w0, w1, [rp, #20]
- mvn u1, u1
- mov r4, #0
- umlal w0, r4, u0, v0 C 3 4
- adcs r7, r7, w0
- mov r5, #0
- strd r6, r7, [rp, #16]!
- sub n, n, #4
- umlal w1, r5, u1, v0 C 0 1
- tst n, n
- bpl L(top)
-
-L(end): adcs r4, r4, w1
- str r4, [rp, #8]
- adc r0, r5, #0
- sub r0, v0, r0
- pop { r4-r11 }
- bx r14
-EPILOGUE()
diff --git a/gmp/mpn/arm/v7a/cora9/gmp-mparam.h b/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
deleted file mode 100644
index 9660257820..0000000000
--- a/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2014 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 1000MHz Cortex-A9 */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 20
-
-#define MUL_TOOM22_THRESHOLD 45
-#define MUL_TOOM33_THRESHOLD 129
-#define MUL_TOOM44_THRESHOLD 387
-#define MUL_TOOM6H_THRESHOLD 517
-#define MUL_TOOM8H_THRESHOLD 774
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 222
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 235
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 208
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 54
-#define SQR_TOOM3_THRESHOLD 181
-#define SQR_TOOM4_THRESHOLD 490
-#define SQR_TOOM6_THRESHOLD 656
-#define SQR_TOOM8_THRESHOLD 0 /* always */
-
-#define MULMID_TOOM42_THRESHOLD 64
-
-#define MULMOD_BNM1_THRESHOLD 26
-#define SQRMOD_BNM1_THRESHOLD 28
-
-#define MUL_FFT_MODF_THRESHOLD 624 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 624, 5}, { 28, 6}, { 15, 5}, { 34, 6}, \
- { 18, 5}, { 37, 6}, { 28, 7}, { 15, 6}, \
- { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \
- { 43, 7}, { 23, 6}, { 47, 7}, { 25, 6}, \
- { 51, 7}, { 27, 6}, { 55, 7}, { 29, 8}, \
- { 15, 7}, { 31, 6}, { 63, 7}, { 37, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
- { 27, 7}, { 57, 9}, { 15, 8}, { 31, 7}, \
- { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \
- { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
- { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
- { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 159,11}, \
- { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
- { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 399, 9}, { 799,10}, { 415,11}, \
- { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \
- { 1023,10}, { 543,11}, { 287,10}, { 575, 9}, \
- { 1151,11}, { 319,10}, { 671,11}, { 351,12}, \
- { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
- { 831,13}, { 127,12}, { 255,11}, { 511,10}, \
- { 1023,11}, { 607,12}, { 319,11}, { 735,12}, \
- { 383,11}, { 863,12}, { 447,11}, { 959,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,12}, { 639,11}, { 1279,12}, { 703,13}, \
- { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \
- { 1663,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1023,11}, { 2047,12}, { 1215,13}, { 639,12}, \
- { 1407,13}, { 767,12}, { 1663,13}, { 895,12}, \
- { 1791,14}, { 511,13}, { 1023,12}, { 2111,13}, \
- { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \
- { 1407,14}, { 767,13}, { 1535,12}, { 3071,13}, \
- { 1663,12}, { 3455,13}, { 1791,15}, { 511,14}, \
- { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \
- { 4351,13}, { 2431,14}, { 1279,13}, { 2559,12}, \
- { 5119,13}, { 2815,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 160
-#define MUL_FFT_THRESHOLD 6784
-
-#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 560, 5}, { 19, 4}, { 39, 5}, { 21, 4}, \
- { 43, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \
- { 17, 5}, { 35, 6}, { 36, 7}, { 19, 6}, \
- { 40, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
- { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \
- { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \
- { 23, 8}, { 55, 9}, { 31, 8}, { 71, 9}, \
- { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \
- { 111,11}, { 63,10}, { 159,11}, { 95,10}, \
- { 191, 9}, { 383,10}, { 207,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511, 8}, { 1023, 9}, \
- { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
- { 799,10}, { 415, 9}, { 831,11}, { 223,12}, \
- { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \
- { 543,11}, { 287,10}, { 575, 9}, { 1151,10}, \
- { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
- { 703,12}, { 191,11}, { 383,10}, { 799,11}, \
- { 415,10}, { 831,13}, { 127,11}, { 511,10}, \
- { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \
- { 1151,11}, { 607,12}, { 319,11}, { 671,10}, \
- { 1343,11}, { 735,12}, { 383,11}, { 863,12}, \
- { 447,11}, { 959,12}, { 511,11}, { 1087,12}, \
- { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
- { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1663,12}, { 895,11}, \
- { 1791,12}, { 959,13}, { 511,12}, { 1023,11}, \
- { 2047,12}, { 1215,13}, { 639,12}, { 1407,13}, \
- { 767,12}, { 1663,13}, { 895,12}, { 1791,14}, \
- { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
- { 2431,13}, { 1279,12}, { 2559,13}, { 1407,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
- { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \
- { 2047,12}, { 4095,13}, { 2175,12}, { 4351,13}, \
- { 2431,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
- { 2815,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 167
-#define SQR_FFT_THRESHOLD 5312
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 38
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 42
-#define DC_DIVAPPR_Q_THRESHOLD 100
-#define DC_BDIV_QR_THRESHOLD 43
-#define DC_BDIV_Q_THRESHOLD 104
-
-#define INV_MULMOD_BNM1_THRESHOLD 98
-#define INV_NEWTON_THRESHOLD 138
-#define INV_APPR_THRESHOLD 133
-
-#define BINV_NEWTON_THRESHOLD 333
-#define REDC_1_TO_REDC_2_THRESHOLD 2
-#define REDC_2_TO_REDC_N_THRESHOLD 142
-
-#define MU_DIV_QR_THRESHOLD 2350
-#define MU_DIVAPPR_Q_THRESHOLD 2259
-#define MUPI_DIV_QR_THRESHOLD 70
-#define MU_BDIV_QR_THRESHOLD 2089
-#define MU_BDIV_Q_THRESHOLD 2172
-
-#define POWM_SEC_TABLE 37,48,81,615,1925
-
-#define MATRIX22_STRASSEN_THRESHOLD 22
-#define HGCD_THRESHOLD 64
-#define HGCD_APPR_THRESHOLD 50
-#define HGCD_REDUCE_THRESHOLD 4284
-#define GCD_DC_THRESHOLD 416
-#define GCDEXT_DC_THRESHOLD 298
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 18
-#define GET_STR_PRECOMPUTE_THRESHOLD 33
-#define SET_STR_DC_THRESHOLD 140
-#define SET_STR_PRECOMPUTE_THRESHOLD 748
-
-#define FAC_DSC_THRESHOLD 309
-#define FAC_ODD_THRESHOLD 29
diff --git a/gmp/mpn/arm64/aors_n.asm b/gmp/mpn/arm64/aors_n.asm
deleted file mode 100644
index a880cd35cf..0000000000
--- a/gmp/mpn/arm64/aors_n.asm
+++ /dev/null
@@ -1,98 +0,0 @@
-dnl ARM64 mpn_add_n and mpn_sub_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-changecom(@&*$)
-
-define(`rp', `x0')
-define(`up', `x1')
-define(`vp', `x2')
-define(`n', `x3')
-
-ifdef(`OPERATION_add_n', `
- define(`ADDSUBC', adcs)
- define(`CLRCY', `cmn xzr, xzr')
- define(`SETCY', `cmp $1, #1')
- define(`RETVAL', `adc x0, xzr, xzr')
- define(`func', mpn_add_n)
- define(`func_nc', mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(`ADDSUBC', sbcs)
- define(`CLRCY', `cmp xzr, xzr')
- define(`SETCY', `subs $1, xzr, $1')
- define(`RETVAL', `sbc x0, xzr, xzr
- and x0, x0, #1')
- define(`func', mpn_sub_n)
- define(`func_nc', mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-ASM_START()
-PROLOGUE(func_nc)
- SETCY( x4)
- b L(ent)
-EPILOGUE()
-PROLOGUE(func)
- CLRCY
-L(ent): tbz n, #0, L(b0)
-
- ldr x4, [up],#8
- ldr x6, [vp],#8
- sub n, n, #1
- ADDSUBC x8, x4, x6
- str x8, [rp],#8
- cbz n, L(rt)
-
-L(b0): ldp x4, x5, [up],#16
- ldp x6, x7, [vp],#16
- sub n, n, #2
- ADDSUBC x8, x4, x6
- ADDSUBC x9, x5, x7
- cbz n, L(end)
-
-L(top): ldp x4, x5, [up],#16
- ldp x6, x7, [vp],#16
- sub n, n, #2
- stp x8, x9, [rp],#16
- ADDSUBC x8, x4, x6
- ADDSUBC x9, x5, x7
- cbnz n, L(top)
-
-L(end): stp x8, x9, [rp]
-L(rt): RETVAL
- ret
-EPILOGUE()
diff --git a/gmp/mpn/arm64/aorsmul_1.asm b/gmp/mpn/arm64/aorsmul_1.asm
deleted file mode 100644
index bf765a7f77..0000000000
--- a/gmp/mpn/arm64/aorsmul_1.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-dnl ARM64 mpn_submul_1
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-changecom(@&*$)
-
-define(`rp', `x0')
-define(`up', `x1')
-define(`n', `x2')
-define(`v0', `x3')
-
-ifdef(`OPERATION_addmul_1', `
- define(`ADDSUB', adds)
- define(`ADDSUBC', adcs)
- define(`COND', `cc')
- define(`func', mpn_addmul_1)')
-ifdef(`OPERATION_submul_1', `
- define(`ADDSUB', subs)
- define(`ADDSUBC', sbcs)
- define(`COND', `cs')
- define(`func', mpn_submul_1)')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-PROLOGUE(func)
- mov x15, #0
-
- tbz n, #0, L(1)
-
- ldr x4, [up],#8
- mul x8, x4, v0
- umulh x12, x4, v0
- adds x8, x8, x15
- ldr x4, [rp,#0]
- ADDSUB x8, x4, x8
- csinc x15, x12, x12, COND
- str x8, [rp],#8
-
-L(1): tbz n, #1, L(2)
-
- ldp x4, x5, [up],#16
- mul x8, x4, v0
- umulh x12, x4, v0
- mul x9, x5, v0
- umulh x13, x5, v0
- adds x8, x8, x15
- adcs x9, x9, x12
- ldp x4, x5, [rp,#0]
- adc x15, x13, xzr
- sub n, n, #1
- ADDSUB x8, x4, x8
- ADDSUBC x9, x5, x9
- csinc x15, x15, x15, COND
- stp x8, x9, [rp],#16
-
-L(2): lsr n, n, 2
- cbz n, L(end)
-
-L(top): ldp x4, x5, [up],#16
- ldp x6, x7, [up],#16
- mul x8, x4, v0
- umulh x12, x4, v0
- mul x9, x5, v0
- umulh x13, x5, v0
- adds x8, x8, x15
- mul x10, x6, v0
- umulh x14, x6, v0
- adcs x9, x9, x12
- mul x11, x7, v0
- umulh x15, x7, v0
- adcs x10, x10, x13
- ldp x4, x5, [rp,#0]
- adcs x11, x11, x14
- ldp x6, x7, [rp,#16]
- adc x15, x15, xzr
- sub n, n, #1
- ADDSUB x8, x4, x8
- ADDSUBC x9, x5, x9
- ADDSUBC x10, x6, x10
- ADDSUBC x11, x7, x11
- stp x8, x9, [rp],#16
- csinc x15, x15, x15, COND
- stp x10, x11, [rp],#16
- cbnz n, L(top)
-
-L(end): mov x0, x15
- ret
-EPILOGUE()
diff --git a/gmp/mpn/arm64/cnd_aors_n.asm b/gmp/mpn/arm64/cnd_aors_n.asm
deleted file mode 100644
index e7836500d5..0000000000
--- a/gmp/mpn/arm64/cnd_aors_n.asm
+++ /dev/null
@@ -1,99 +0,0 @@
-dnl ARM64 mpn_cnd_add_n, mpn_cnd_sub_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-changecom(@&*$)
-
-define(`cnd', `x0')
-define(`rp', `x1')
-define(`up', `x2')
-define(`vp', `x3')
-define(`n', `x4')
-
-ifdef(`OPERATION_cnd_add_n', `
- define(`ADDSUBC', adcs)
- define(`CLRCY', `cmn xzr, xzr')
- define(`RETVAL', `adc x0, xzr, xzr')
- define(func, mpn_cnd_add_n)')
-ifdef(`OPERATION_cnd_sub_n', `
- define(`ADDSUBC', sbcs)
- define(`CLRCY', `cmp xzr, xzr')
- define(`RETVAL', `sbc x0, xzr, xzr
- and x0, x0, #1')
- define(func, mpn_cnd_sub_n)')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ASM_START()
-PROLOGUE(func)
- cmp cnd, #1
- sbc cnd, cnd, cnd
-
- CLRCY C really only needed for n = 0 (mod 4)
-
- tbz n, #0, L(1)
- ldr x10, [up], #8
- ldr x12, [vp], #8
- bic x6, x12, cnd
- ADDSUBC x8, x10, x6
- sub n, n, #1
- str x8, [rp], #8
- cbz n, L(rt)
-
-L(1): ldp x10, x11, [up], #16
- ldp x12, x13, [vp], #16
- sub n, n, #2
- cbz n, L(end)
-
-L(top): bic x6, x12, cnd
- bic x7, x13, cnd
- ldp x12, x13, [vp], #16
- ADDSUBC x8, x10, x6
- ADDSUBC x9, x11, x7
- ldp x10, x11, [up], #16
- sub n, n, #2
- stp x8, x9, [rp], #16
- cbnz n, L(top)
-
-L(end): bic x6, x12, cnd
- bic x7, x13, cnd
- ADDSUBC x8, x10, x6
- ADDSUBC x9, x11, x7
- stp x8, x9, [rp]
-L(rt): RETVAL
- ret
-EPILOGUE()
diff --git a/gmp/mpn/arm64/copyd.asm b/gmp/mpn/arm64/copyd.asm
deleted file mode 100644
index bb477716e5..0000000000
--- a/gmp/mpn/arm64/copyd.asm
+++ /dev/null
@@ -1,93 +0,0 @@
-dnl ARM64 mpn_copyd.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-changecom(@&*$)
-
-define(`rp', `x0')
-define(`up', `x1')
-define(`n', `x2')
-
-ASM_START()
-PROLOGUE(mpn_copyd)
- add rp, rp, n, lsl #3
- add up, up, n, lsl #3
-
- cmp n, #3
- b.le L(bc)
-
-C Copy until rp is 128-bit aligned
- tbz rp, #3, L(al2)
- sub up, up, #8
- ld1 {v22.1d}, [up]
- sub n, n, #1
- sub rp, rp, #8
- st1 {v22.1d}, [rp]
-
-L(al2): sub up, up, #16
- ld1 {v26.2d}, [up]
- subs n, n, #6
- sub rp, rp, #16 C offset rp for loop
- b.lt L(end)
-
- sub up, up, #16 C offset up for loop
- mov x12, #-16
-
- ALIGN(16)
-L(top): ld1 {v22.2d}, [up], x12
- st1 {v26.2d}, [rp], x12
- ld1 {v26.2d}, [up], x12
- st1 {v22.2d}, [rp], x12
- subs n, n, #4
- b.ge L(top)
-
- add up, up, #16 C undo up offset
-
-L(end): st1 {v26.2d}, [rp]
-
-C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we
-C arrive here via L(bc)
-L(bc): tbz n, #1, L(tl1)
- sub up, up, #16
- ld1 {v22.2d}, [up]
- sub rp, rp, #16
- st1 {v22.2d}, [rp]
-L(tl1): tbz n, #0, L(tl2)
- sub up, up, #8
- ld1 {v22.1d}, [up]
- sub rp, rp, #8
- st1 {v22.1d}, [rp]
-L(tl2): ret
-EPILOGUE()
diff --git a/gmp/mpn/arm64/copyi.asm b/gmp/mpn/arm64/copyi.asm
deleted file mode 100644
index 8f7dbd4f52..0000000000
--- a/gmp/mpn/arm64/copyi.asm
+++ /dev/null
@@ -1,77 +0,0 @@
-dnl ARM64 mpn_copyi.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-changecom(@&*$)
-
-define(`rp', `x0')
-define(`up', `x1')
-define(`n', `x2')
-
-ASM_START()
-PROLOGUE(mpn_copyi)
- cmp n, #3
- b.le L(bc)
-
-C Copy until rp is 128-bit aligned
- tbz rp, #3, L(al2)
- ld1 {v22.1d}, [up], #8
- sub n, n, #1
- st1 {v22.1d}, [rp], #8
-
-L(al2): ld1 {v26.2d}, [up], #16
- subs n, n, #6
- b.lt L(end)
-
- ALIGN(16)
-L(top): ld1 {v22.2d}, [up], #16
- st1 {v26.2d}, [rp], #16
- ld1 {v26.2d}, [up], #16
- st1 {v22.2d}, [rp], #16
- subs n, n, #4
- b.ge L(top)
-
-L(end): st1 {v26.2d}, [rp], #16
-
-C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we
-C arrive here via L(bc)
-L(bc): tbz n, #1, L(tl1)
- ld1 {v22.2d}, [up], #16
- st1 {v22.2d}, [rp], #16
-L(tl1): tbz n, #0, L(tl2)
- ld1 {v22.1d}, [up]
- st1 {v22.1d}, [rp]
-L(tl2): ret
-EPILOGUE()
diff --git a/gmp/mpn/arm64/gcd_1.asm b/gmp/mpn/arm64/gcd_1.asm
deleted file mode 100644
index d231dbcbb9..0000000000
--- a/gmp/mpn/arm64/gcd_1.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-dnl ARM v6t2 mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-changecom(@&*$)
-
-C cycles/bit (approx)
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-C TODO
-C * Optimise inner-loop better.
-C * Push saving/restoring of callee-user regs into call code
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 7)
-
-C INPUT PARAMETERS
-define(`up', `x0')
-define(`n', `x1')
-define(`v0', `x2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
- `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- stp x29, x30, [sp,#-32]!
- ldr x3, [up] C U low limb
- stp x19, x20, [sp,#16]
-
- orr x3, x3, v0
- rbit x4, x3
- clz x20, x4 C min(ctz(u0),ctz(v0))
-
- rbit x12, v0
- clz x12, x12
- lsr v0, v0, x12
-
- mov x19, v0
-
- cmp n, #1
- bne L(nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- ldr x3, [up]
- cmp v0, x3, lsr #BMOD_THRES_LOG2
- bhi L(red1)
-
-L(bmod):mov x3, #0 C carry argument
- bl mpn_modexact_1c_odd
- b L(red0)
-
-L(nby1):cmp n, #BMOD_1_TO_MOD_1_THRESHOLD
- blo L(bmod)
-
- bl mpn_mod_1
-
-L(red0):mov x3, x0
-L(red1):cmp x3, #0
- rbit x12, x3
- clz x12, x12
- bne L(mid)
- b L(end)
-
- ALIGN(8)
-L(top):
-ifelse(1,1,`
-C This shorter variant makes full use of armv8 insns
- csneg x3, x1, x1, cs C if x-y < 0
- csel x19, x4, x19, cs C use x,y-x
-L(mid): lsr x4, x3, x12 C
- subs x1, x19, x4 C
-',`
-C This variant is akin to the 32-bit v6t2 code
- csel x3, x1, x3, cs C if x-y < 0
- csel x19, x0, x19, cs C use x,y-x
-L(mid): lsr x3, x3, x12 C
- mov x0, x3 C
- subs x1, x19, x3 C
- sub x3, x3, x19 C
-')
- rbit x12, x1
- clz x12, x12 C
- bne L(top) C
-
-L(end): lsl x0, x19, x20
- ldp x19, x20, [sp,#16]
- ldp x29, x30, [sp],#32
- ret
-EPILOGUE()
diff --git a/gmp/mpn/arm64/invert_limb.asm b/gmp/mpn/arm64/invert_limb.asm
deleted file mode 100644
index 2302d047e5..0000000000
--- a/gmp/mpn/arm64/invert_limb.asm
+++ /dev/null
@@ -1,83 +0,0 @@
-dnl ARM64 mpn_invert_limb -- Invert a normalized limb.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-C Compiler generated, mildly edited. Could surely be further optimised.
-
-ASM_START()
-PROLOGUE(mpn_invert_limb)
- lsr x2, x0, 54
- adrp x1, approx_tab
- and x2, x2, #0x1fe
- add x1, x1, :lo12:approx_tab
- ldrh w3, [x1,x2]
- lsr x4, x0, 24
- add x4, x4, 1
- ubfiz x2, x3, 11, 16
- umull x3, w3, w3
- mul x3, x3, x4
- sub x2, x2, #1
- sub x2, x2, x3, lsr 40
- lsl x3, x2, 60
- mul x1, x2, x2
- msub x1, x1, x4, x3
- lsl x2, x2, 13
- add x1, x2, x1, lsr 47
- and x2, x0, 1
- neg x3, x2
- and x3, x3, x1, lsr 1
- add x2, x2, x0, lsr 1
- msub x2, x1, x2, x3
- umulh x2, x2, x1
- lsl x1, x1, 31
- add x1, x1, x2, lsr 1
- mul x3, x1, x0
- umulh x2, x1, x0
- adds x4, x3, x0
- adc x0, x2, x0
- sub x0, x1, x0
- ret
-EPILOGUE()
-
- RODATA
- ALIGN(2)
- TYPE( approx_tab, object)
- SIZE( approx_tab, 512)
-approx_tab:
-forloop(i,256,512-1,dnl
-` .hword eval(0x7fd00/i)
-')dnl
diff --git a/gmp/mpn/arm64/logops_n.asm b/gmp/mpn/arm64/logops_n.asm
deleted file mode 100644
index 0f75700cfd..0000000000
--- a/gmp/mpn/arm64/logops_n.asm
+++ /dev/null
@@ -1,106 +0,0 @@
-dnl ARM64 mpn_and_n, mpn_andn_n. mpn_nand_n, etc.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-changecom(@&*$)
-
-define(`rp', `x0')
-define(`up', `x1')
-define(`vp', `x2')
-define(`n', `x3')
-
-define(`POSTOP', `dnl')
-
-ifdef(`OPERATION_and_n',`
- define(`func', `mpn_and_n')
- define(`LOGOP', `and $1, $2, $3')')
-ifdef(`OPERATION_andn_n',`
- define(`func', `mpn_andn_n')
- define(`LOGOP', `bic $1, $2, $3')')
-ifdef(`OPERATION_nand_n',`
- define(`func', `mpn_nand_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `and $1, $2, $3')')
-ifdef(`OPERATION_ior_n',`
- define(`func', `mpn_ior_n')
- define(`LOGOP', `orr $1, $2, $3')')
-ifdef(`OPERATION_iorn_n',`
- define(`func', `mpn_iorn_n')
- define(`LOGOP', `orn $1, $2, $3')')
-ifdef(`OPERATION_nior_n',`
- define(`func', `mpn_nior_n')
- define(`POSTOP', `mvn $1, $1')
- define(`LOGOP', `orr $1, $2, $3')')
-ifdef(`OPERATION_xor_n',`
- define(`func', `mpn_xor_n')
- define(`LOGOP', `eor $1, $2, $3')')
-ifdef(`OPERATION_xnor_n',`
- define(`func', `mpn_xnor_n')
- define(`LOGOP', `eon $1, $2, $3')')
-
-MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-
-ASM_START()
-PROLOGUE(func)
- tbz n, #0, L(b0)
-
- ldr x4, [up],#8
- ldr x6, [vp],#8
- sub n, n, #1
- LOGOP( x8, x4, x6)
- POSTOP( x8)
- str x8, [rp],#8
- cbz n, L(rtn)
-
-L(b0): ldp x4, x5, [up],#16
- ldp x6, x7, [vp],#16
- sub n, n, #2
- b L(mid)
-
-L(top): ldp x4, x5, [up],#16
- ldp x6, x7, [vp],#16
- sub n, n, #2
- stp x8, x9, [rp],#16
-L(mid): LOGOP( x8, x4, x6)
- LOGOP( x9, x5, x7)
- POSTOP( x8)
- POSTOP( x9)
- cbnz n, L(top)
-
- stp x8, x9, [rp],#16
-L(rtn): ret
-EPILOGUE()
diff --git a/gmp/mpn/arm64/mul_1.asm b/gmp/mpn/arm64/mul_1.asm
deleted file mode 100644
index c0c2570f0d..0000000000
--- a/gmp/mpn/arm64/mul_1.asm
+++ /dev/null
@@ -1,98 +0,0 @@
-dnl ARM64 mpn_mul_1
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
-
-define(`rp', `x0')
-define(`up', `x1')
-define(`n', `x2')
-define(`v0', `x3')
-
-ASM_START()
-PROLOGUE(mpn_mul_1)
- ldr x12, [up], #8
- and x6, n, #3
- and n, n, #-4
- cbz x6, L(fi0)
- cmp x6, #2
- bcc L(fi1)
- beq L(fi2)
-
-L(fi3): mul x8, x12, v0
- umulh x13, x12, v0
- cmn xzr, xzr
- b L(L3)
-L(fi2): mul x7, x12, v0
- umulh x5, x12, v0
- cmn xzr, xzr
- b L(L2)
-L(fi0): mul x9, x12, v0
- umulh x5, x12, v0
- sub n, n, #4
- cmn xzr, xzr
- b L(L0)
-L(fi1): mul x10, x12, v0
- umulh x13, x12, v0
- cmn xzr, xzr
- cbz n, L(end)
-
-L(top): sub n, n, #4
- ldr x12, [up], #8
- mul x6, x12, v0
- umulh x5, x12, v0
- str x10, [rp], #8
- adcs x9, x6, x13
-L(L0): ldr x12, [up], #8
- mul x6, x12, v0
- umulh x13, x12, v0
- str x9, [rp] ,#8
- adcs x8, x6, x5
-L(L3): ldr x12, [up], #8
- mul x6, x12, v0
- umulh x5, x12, v0
- str x8, [rp], #8
- adcs x7, x6, x13
-L(L2): ldr x12, [up], #8
- mul x6, x12, v0
- umulh x13, x12, v0
- str x7, [rp], #8
- adcs x10, x6, x5
- cbnz n, L(top)
-
-L(end): str x10, [rp]
- adc x0, x13, xzr
- ret
-EPILOGUE()
diff --git a/gmp/mpn/asm-defs.m4 b/gmp/mpn/asm-defs.m4
index e573cc4ca8..ee9626dd57 100644
--- a/gmp/mpn/asm-defs.m4
+++ b/gmp/mpn/asm-defs.m4
@@ -2,33 +2,23 @@ divert(-1)
dnl
dnl m4 macros for gmp assembly code, shared by all CPUs.
-dnl Copyright 1999-2006, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl These macros are designed for use with any m4 and have been used on
@@ -59,7 +49,7 @@ dnl
dnl But note that when a quoted string is being read, a # isn't special, so
dnl apostrophes in comments in quoted strings must be avoided or they'll be
dnl interpreted as a closing quote mark. But when the quoted text is
-dnl re-read # will still act like a normal comment, suppressing macro
+dnl re-read # will still act like a normal comment, suppressing macro
dnl expansion.
dnl
dnl For example,
@@ -229,7 +219,7 @@ undefine(`m4_dollarhash_1_if_noparen_test')
dnl Usage: m4wrap_prepend(string)
dnl
-dnl Prepend the given string to what will be expanded under m4wrap at the
+dnl Prepend the given string to what will be expanded under m4wrap at the
dnl end of input.
dnl
dnl This macro exists to work around variations in m4wrap() behaviour in
@@ -877,7 +867,7 @@ ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')')
dnl Called: deflit_emptyargcheck(macroname,$#,`$1')
define(deflit_emptyargcheck,
`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1,
-`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-defs.m4 for more information)
+`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-defs.m4 for more information)
')')')
@@ -1064,18 +1054,6 @@ dnl aors_n
m4_not_for_expansion(`OPERATION_add_n')
m4_not_for_expansion(`OPERATION_sub_n')
-dnl aors_err1_n
-m4_not_for_expansion(`OPERATION_add_err1_n')
-m4_not_for_expansion(`OPERATION_sub_err1_n')
-
-dnl aors_err2_n
-m4_not_for_expansion(`OPERATION_add_err2_n')
-m4_not_for_expansion(`OPERATION_sub_err2_n')
-
-dnl aors_err3_n
-m4_not_for_expansion(`OPERATION_add_err3_n')
-m4_not_for_expansion(`OPERATION_sub_err3_n')
-
dnl aorsmul_1
m4_not_for_expansion(`OPERATION_addmul_1')
m4_not_for_expansion(`OPERATION_submul_1')
@@ -1101,12 +1079,6 @@ m4_not_for_expansion(`OPERATION_rshift')
dnl aorslsh1_n
m4_not_for_expansion(`OPERATION_addlsh1_n')
m4_not_for_expansion(`OPERATION_sublsh1_n')
-m4_not_for_expansion(`OPERATION_rsblsh1_n')
-
-dnl aorslsh2_n
-m4_not_for_expansion(`OPERATION_addlsh2_n')
-m4_not_for_expansion(`OPERATION_sublsh2_n')
-m4_not_for_expansion(`OPERATION_rsblsh2_n')
dnl rsh1aors_n
m4_not_for_expansion(`OPERATION_rsh1add_n')
@@ -1119,7 +1091,7 @@ dnl Check that `symbol' is defined. If it isn't, issue an error and
dnl terminate immediately. The error message explains that the symbol
dnl should be in config.m4, copied from gmp-mparam.h.
dnl
-dnl Termination is immediate since missing say SQR_TOOM2_THRESHOLD can
+dnl Termination is immediate since missing say SQR_KARATSUBA_THRESHOLD can
dnl lead to infinite loops and endless error messages.
define(m4_config_gmp_mparam,
@@ -1243,10 +1215,10 @@ dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow
dnl the new definition automatically.
dnl
dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as
-dnl a bit mask. BYTES is GMP_LIMB_BYTES*COUNT, the number of bytes
+dnl a bit mask. BYTES is BYTES_PER_MP_LIMB*COUNT, the number of bytes
dnl processed in each unrolled loop.
dnl
-dnl GMP_LIMB_BYTES is defined in a CPU specific m4 include file. It
+dnl BYTES_PER_MP_LIMB is defined in a CPU specific m4 include file. It
dnl exists only so the BYTES definitions here can be common to all CPUs.
dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be
dnl used because the code is only for a particular CPU, it doesn't need to
@@ -1283,8 +1255,8 @@ m4_assert_defined(`UNROLL_COUNT')
deflit(UNROLL_BYTES,
m4_assert_defined(`UNROLL_COUNT')
-m4_assert_defined(`GMP_LIMB_BYTES')
-`eval(UNROLL_COUNT * GMP_LIMB_BYTES)')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(UNROLL_COUNT * BYTES_PER_MP_LIMB)')
deflit(CHUNK_LOG2,
m4_assert_defined(`CHUNK_COUNT')
@@ -1296,8 +1268,8 @@ m4_assert_defined(`CHUNK_COUNT')
deflit(CHUNK_BYTES,
m4_assert_defined(`CHUNK_COUNT')
-m4_assert_defined(`GMP_LIMB_BYTES')
-`eval(CHUNK_COUNT * GMP_LIMB_BYTES)')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(CHUNK_COUNT * BYTES_PER_MP_LIMB)')
dnl Usage: MPN(name)
@@ -1324,65 +1296,31 @@ dnl function that might be implemented in assembler is here.
define(define_mpn,
m4_assert_numargs(1)
-`deflit(`mpn_$1',`MPN(`$1')')')
+`define(`mpn_$1',`MPN(`$1')')')
define_mpn(add)
define_mpn(add_1)
-define_mpn(add_err1_n)
-define_mpn(add_err2_n)
-define_mpn(add_err3_n)
define_mpn(add_n)
define_mpn(add_nc)
define_mpn(addlsh1_n)
-define_mpn(addlsh1_nc)
-define_mpn(addlsh2_n)
-define_mpn(addlsh2_nc)
-define_mpn(addlsh_n)
-define_mpn(addlsh_nc)
-define_mpn(addlsh1_n_ip1)
-define_mpn(addlsh1_nc_ip1)
-define_mpn(addlsh2_n_ip1)
-define_mpn(addlsh2_nc_ip1)
-define_mpn(addlsh_n_ip1)
-define_mpn(addlsh_nc_ip1)
-define_mpn(addlsh1_n_ip2)
-define_mpn(addlsh1_nc_ip2)
-define_mpn(addlsh2_n_ip2)
-define_mpn(addlsh2_nc_ip2)
-define_mpn(addlsh_n_ip2)
-define_mpn(addlsh_nc_ip2)
define_mpn(addmul_1)
define_mpn(addmul_1c)
define_mpn(addmul_2)
define_mpn(addmul_3)
define_mpn(addmul_4)
-define_mpn(addmul_5)
-define_mpn(addmul_6)
-define_mpn(addmul_7)
-define_mpn(addmul_8)
-define_mpn(addmul_2s)
-define_mpn(add_n_sub_n)
-define_mpn(add_n_sub_nc)
+define_mpn(addsub_n)
+define_mpn(addsub_nc)
define_mpn(addaddmul_1msb0)
define_mpn(and_n)
define_mpn(andn_n)
-define_mpn(bdiv_q_1)
-define_mpn(pi1_bdiv_q_1)
define_mpn(bdiv_dbm1c)
+define_mpn(bdivmod)
define_mpn(cmp)
-define_mpn(cnd_add_n)
-define_mpn(cnd_sub_n)
-define_mpn(com)
+define_mpn(com_n)
define_mpn(copyd)
define_mpn(copyi)
define_mpn(count_leading_zeros)
define_mpn(count_trailing_zeros)
-define_mpn(div_qr_1n_pi1)
-define_mpn(div_qr_2)
-define_mpn(div_qr_2n_pi1)
-define_mpn(div_qr_2u_pi1)
-define_mpn(div_qr_2n_pi2)
-define_mpn(div_qr_2u_pi2)
define_mpn(divexact_1)
define_mpn(divexact_by3c)
define_mpn(divrem)
@@ -1398,19 +1336,16 @@ define_mpn(gcdext)
define_mpn(get_str)
define_mpn(hamdist)
define_mpn(invert_limb)
-define_mpn(invert_limb_table)
define_mpn(ior_n)
define_mpn(iorn_n)
+define_mpn(kara_mul_n)
+define_mpn(kara_sqr_n)
define_mpn(lshift)
define_mpn(lshiftc)
-define_mpn(mod_1_1p)
-define_mpn(mod_1_1p_cps)
-define_mpn(mod_1s_2p)
-define_mpn(mod_1s_2p_cps)
-define_mpn(mod_1s_3p)
-define_mpn(mod_1s_3p_cps)
-define_mpn(mod_1s_4p)
-define_mpn(mod_1s_4p_cps)
+define_mpn(mod_1_1)
+define_mpn(mod_1_2)
+define_mpn(mod_1_3)
+define_mpn(mod_1_4)
define_mpn(mod_1)
define_mpn(mod_1c)
define_mpn(mod_34lsub1)
@@ -1422,18 +1357,14 @@ define_mpn(mul_1c)
define_mpn(mul_2)
define_mpn(mul_3)
define_mpn(mul_4)
-define_mpn(mul_5)
-define_mpn(mul_6)
define_mpn(mul_basecase)
define_mpn(mul_n)
-define_mpn(mullo_basecase)
-define_mpn(mulmid_basecase)
define_mpn(perfect_square_p)
define_mpn(popcount)
define_mpn(preinv_divrem_1)
define_mpn(preinv_mod_1)
define_mpn(nand_n)
-define_mpn(neg)
+define_mpn(neg_n)
define_mpn(nior_n)
define_mpn(powm)
define_mpn(powlo)
@@ -1441,16 +1372,8 @@ define_mpn(random)
define_mpn(random2)
define_mpn(redc_1)
define_mpn(redc_2)
-define_mpn(rsblsh1_n)
-define_mpn(rsblsh1_nc)
-define_mpn(rsblsh2_n)
-define_mpn(rsblsh2_nc)
-define_mpn(rsblsh_n)
-define_mpn(rsblsh_nc)
define_mpn(rsh1add_n)
-define_mpn(rsh1add_nc)
define_mpn(rsh1sub_n)
-define_mpn(rsh1sub_nc)
define_mpn(rshift)
define_mpn(rshiftc)
define_mpn(scan0)
@@ -1458,31 +1381,17 @@ define_mpn(scan1)
define_mpn(set_str)
define_mpn(sqr_basecase)
define_mpn(sqr_diagonal)
-define_mpn(sqr_diag_addlsh1)
define_mpn(sub_n)
define_mpn(sublsh1_n)
-define_mpn(sublsh1_nc)
-define_mpn(sublsh1_n_ip1)
-define_mpn(sublsh1_nc_ip1)
-define_mpn(sublsh2_n)
-define_mpn(sublsh2_nc)
-define_mpn(sublsh2_n_ip1)
-define_mpn(sublsh2_nc_ip1)
-define_mpn(sublsh_n)
-define_mpn(sublsh_nc)
-define_mpn(sublsh_n_ip1)
-define_mpn(sublsh_nc_ip1)
define_mpn(sqrtrem)
define_mpn(sub)
define_mpn(sub_1)
-define_mpn(sub_err1_n)
-define_mpn(sub_err2_n)
-define_mpn(sub_err3_n)
define_mpn(sub_n)
define_mpn(sub_nc)
define_mpn(submul_1)
define_mpn(submul_1c)
-define_mpn(sec_tabselect)
+define_mpn(toom3_mul_n)
+define_mpn(toom3_sqr_n)
define_mpn(umul_ppmm)
define_mpn(umul_ppmm_r)
define_mpn(udiv_qrnnd)
@@ -1724,22 +1633,6 @@ m4_assert_numargs(1)
)
-dnl Usage: ABI_SUPPORT(abi)
-dnl
-dnl A dummy macro which is grepped for by ./configure to know what ABIs
-dnl are supported in an asm file.
-dnl
-dnl If multiple non-standard ABIs are supported, several ABI_SUPPORT
-dnl declarations should be used:
-dnl
-dnl ABI_SUPPORT(FOOABI)
-dnl ABI_SUPPORT(BARABI)
-
-define(ABI_SUPPORT,
-m4_assert_numargs(1)
-)
-
-
dnl Usage: GMP_NUMB_MASK
dnl
dnl A bit mask for the number part of a limb. Eg. with 6 bit nails in a
@@ -1751,11 +1644,4 @@ m4_assert_defined(`GMP_NUMB_BITS')
`m4_hex_lowmask(GMP_NUMB_BITS)')
-dnl Usage: m4append(`variable',`value-to-append')
-
-define(`m4append',
-`define(`$1', defn(`$1')`$2')
-'
-)
-
divert`'dnl
diff --git a/gmp/mpn/clipper/add_n.s b/gmp/mpn/clipper/add_n.s
new file mode 100644
index 0000000000..225b95042c
--- /dev/null
+++ b/gmp/mpn/clipper/add_n.s
@@ -0,0 +1,46 @@
+; Clipper __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 16
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ subq $8,sp ; reserve 8 bytes of stack (released again before ret)
+ storw r6,(sp) ; save r6, which is used as scratch below and reloaded at exit
+ loadw 12(sp),r2 ; r2 = second source pointer, fetched from the caller's stack area
+ loadw 16(sp),r3 ; r3 = loop count (number of limbs), fetched from the caller's stack area
+ loadq $0,r6 ; clear carry-save register
+
+.Loop: loadw (r1),r4 ; r4 = next limb of the vector addressed by r1
+ loadw (r2),r5 ; r5 = next limb of the vector addressed by r2
+ addwc r6,r6 ; restore carry from r6
+ addwc r5,r4 ; add the two limbs with carry; the sum is in r4 (stored next)
+ storw r4,(r0) ; write the sum limb to the result vector addressed by r0
+ subwc r6,r6 ; save carry in r6
+ addq $4,r0 ; step result pointer by one 4-byte limb
+ addq $4,r1 ; step first source pointer
+ addq $4,r2 ; step second source pointer
+ subq $1,r3 ; one limb fewer to go
+ brne .Loop ; loop while the count is non-zero
+
+ negw r6,r0 ; return value in r0 = negated carry-save register (presumably carry-out as 0 or 1)
+ loadw (sp),r6 ; restore the r6 value saved on entry
+ addq $8,sp ; release the stack reserved on entry
+ ret sp
diff --git a/gmp/mpn/clipper/mul_1.s b/gmp/mpn/clipper/mul_1.s
new file mode 100644
index 0000000000..058a317617
--- /dev/null
+++ b/gmp/mpn/clipper/mul_1.s
@@ -0,0 +1,45 @@
+; Clipper __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 16
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ subq $8,sp ; reserve 8 bytes of stack (released again before ret)
+ storw r6,(sp) ; save r6, which holds the running carry limb below
+ loadw 12(sp),r2 ; r2 = loop count (number of limbs), fetched from the caller's stack area
+ loadw 16(sp),r3 ; r3 = multiplier limb, fetched from the caller's stack area
+ loadq $0,r6 ; clear carry limb
+
+.Loop: loadw (r1),r4 ; r4 = next source limb from the vector addressed by r1
+ mulwux r3,r4 ; multiply by r3; low product limb is used from r4, high from r5 (presumably an r4/r5 pair result)
+ addw r6,r4 ; add old carry limb into low product limb
+ loadq $0,r6 ; zero r6 so the next add leaves high limb + carry in it
+ addwc r5,r6 ; propagate cy into high product limb
+ storw r4,(r0) ; write the low product limb to the result vector addressed by r0
+ addq $4,r0 ; step result pointer by one 4-byte limb
+ addq $4,r1 ; step source pointer
+ subq $1,r2 ; one limb fewer to go
+ brne .Loop ; loop while the count is non-zero
+
+ movw r6,r0 ; return value in r0 = final carry limb
+ loadw 0(sp),r6 ; restore the r6 value saved on entry
+ addq $8,sp ; release the stack reserved on entry
+ ret sp
diff --git a/gmp/mpn/clipper/sub_n.s b/gmp/mpn/clipper/sub_n.s
new file mode 100644
index 0000000000..58c2cb3342
--- /dev/null
+++ b/gmp/mpn/clipper/sub_n.s
@@ -0,0 +1,46 @@
+; Clipper __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 16
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ subq $8,sp ; reserve 8 bytes of stack (released again before ret)
+ storw r6,(sp) ; save r6, which is used as scratch below and reloaded at exit
+ loadw 12(sp),r2 ; r2 = second source pointer, fetched from the caller's stack area
+ loadw 16(sp),r3 ; r3 = loop count (number of limbs), fetched from the caller's stack area
+ loadq $0,r6 ; clear carry-save register
+
+.Loop: loadw (r1),r4 ; r4 = next limb of the vector addressed by r1
+ loadw (r2),r5 ; r5 = next limb of the vector addressed by r2
+ addwc r6,r6 ; restore carry (the borrow from the previous limb) from r6
+ subwc r5,r4 ; subtract with carry; the difference is in r4 (presumably r4 - r5 - borrow)
+ storw r4,(r0) ; write the difference limb to the result vector addressed by r0
+ subwc r6,r6 ; save carry (borrow) in r6
+ addq $4,r0 ; step result pointer by one 4-byte limb
+ addq $4,r1 ; step first source pointer
+ addq $4,r2 ; step second source pointer
+ subq $1,r3 ; one limb fewer to go
+ brne .Loop ; loop while the count is non-zero
+
+ negw r6,r0 ; return value in r0 = negated carry-save register (presumably borrow-out as 0 or 1)
+ loadw (sp),r6 ; restore the r6 value saved on entry
+ addq $8,sp ; release the stack reserved on entry
+ ret sp
diff --git a/gmp/mpn/cpp-ccas b/gmp/mpn/cpp-ccas
index 25f7cdcbeb..fd62f902d1 100755
--- a/gmp/mpn/cpp-ccas
+++ b/gmp/mpn/cpp-ccas
@@ -4,31 +4,20 @@
# Copyright 2001 Free Software Foundation, Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# Usage: cpp-cc --cpp=CPP CC ... file.S ...
diff --git a/gmp/mpn/cray/README b/gmp/mpn/cray/README
index 3a347d2805..ab3b032706 100644
--- a/gmp/mpn/cray/README
+++ b/gmp/mpn/cray/README
@@ -1,30 +1,19 @@
-Copyright 2000-2002 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -45,7 +34,7 @@ systems with cfp floating point, the main obstacle is the forming of
128-bit products. For IEEE systems, adding, and in particular
computing carry is the main issue. There are no vectorizing
unsigned-less-than instructions, and the sequence that implement that
-operation is very long.
+opetration is very long.
Shifting is the only operation that is simple to make fast. All Cray
systems have a bitblt instructions (Vi Vj,Vj<Ak and Vi Vj,Vj>Ak) that
@@ -118,4 +107,4 @@ down to 2.5 cycles/limb and mpn_addmul_1 times to 4 cycles/limb. By
storing even fewer bits per limb, perhaps 56, it would be possible to
write a mul_mul_basecase that would run at effectively 1 cycle/limb.
(Use VM here to better handle the romb-shaped multiply area, perhaps
-rounding operand sizes up to the next power of 2.)
+rounding operand sizes up to the next power of 2.)
diff --git a/gmp/mpn/cray/add_n.c b/gmp/mpn/cray/add_n.c
index 65b53bf87a..e4f8a0da9b 100644
--- a/gmp/mpn/cray/add_n.c
+++ b/gmp/mpn/cray/add_n.c
@@ -6,28 +6,17 @@ Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* This code runs at 4 cycles/limb. It may be possible to bring it down
to 3 cycles/limb. */
diff --git a/gmp/mpn/cray/cfp/addmul_1.c b/gmp/mpn/cray/cfp/addmul_1.c
index e1d52e4a5f..c981b3d3a8 100644
--- a/gmp/mpn/cray/cfp/addmul_1.c
+++ b/gmp/mpn/cray/cfp/addmul_1.c
@@ -5,28 +5,17 @@ Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
diff --git a/gmp/mpn/cray/cfp/mul_1.c b/gmp/mpn/cray/cfp/mul_1.c
index 611a9d2532..5038e93fef 100644
--- a/gmp/mpn/cray/cfp/mul_1.c
+++ b/gmp/mpn/cray/cfp/mul_1.c
@@ -5,28 +5,17 @@ Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
diff --git a/gmp/mpn/cray/cfp/mulwwc90.s b/gmp/mpn/cray/cfp/mulwwc90.s
index 71d2285fd7..3234913c10 100644
--- a/gmp/mpn/cray/cfp/mulwwc90.s
+++ b/gmp/mpn/cray/cfp/mulwwc90.s
@@ -1,33 +1,22 @@
-* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
-* Copyright 1996, 2000 Free Software Foundation, Inc.
-* This file is generated from mulww.f in this same directory.
+* Copyright 1996, 2000 Free Software Foundation, Inc.
+* This file is generated from mulww.f in this same directory.
-* This file is part of the GNU MP Library.
-*
-* The GNU MP Library is free software; you can redistribute it and/or modify
-* it under the terms of either:
-*
-* * the GNU Lesser General Public License as published by the Free
-* Software Foundation; either version 3 of the License, or (at your
-* option) any later version.
-*
-* or
-*
-* * the GNU General Public License as published by the Free Software
-* Foundation; either version 2 of the License, or (at your option) any
-* later version.
-*
-* or both in parallel, as here.
-*
-* The GNU MP Library is distributed in the hope that it will be useful, but
-* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-* for more details.
-*
-* You should have received copies of the GNU General Public License and the
-* GNU Lesser General Public License along with the GNU MP Library. If not,
-* see https://www.gnu.org/licenses/.
+* This file is part of the GNU MP Library.
+
+* The GNU MP Library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 3 of the
+* License, or (at your option) any later version.
+
+* The GNU MP Library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+
+* You should have received a copy of the GNU Lesser General Public License
+* along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
IDENT GMPN_MULWW
**********************************************
diff --git a/gmp/mpn/cray/cfp/mulwwj90.s b/gmp/mpn/cray/cfp/mulwwj90.s
index 1c2c7cddbe..94d391c2f9 100644
--- a/gmp/mpn/cray/cfp/mulwwj90.s
+++ b/gmp/mpn/cray/cfp/mulwwj90.s
@@ -1,33 +1,22 @@
-* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
-* Copyright 1996, 2000 Free Software Foundation, Inc.
-* This file is generated from mulww.f in this same directory.
+* Copyright 1996, 2000 Free Software Foundation, Inc.
+* This file is generated from mulww.f in this same directory.
-* This file is part of the GNU MP Library.
-*
-* The GNU MP Library is free software; you can redistribute it and/or modify
-* it under the terms of either:
-*
-* * the GNU Lesser General Public License as published by the Free
-* Software Foundation; either version 3 of the License, or (at your
-* option) any later version.
-*
-* or
-*
-* * the GNU General Public License as published by the Free Software
-* Foundation; either version 2 of the License, or (at your option) any
-* later version.
-*
-* or both in parallel, as here.
-*
-* The GNU MP Library is distributed in the hope that it will be useful, but
-* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-* for more details.
-*
-* You should have received copies of the GNU General Public License and the
-* GNU Lesser General Public License along with the GNU MP Library. If not,
-* see https://www.gnu.org/licenses/.
+* This file is part of the GNU MP Library.
+
+* The GNU MP Library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 3 of the
+* License, or (at your option) any later version.
+
+* The GNU MP Library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+
+* You should have received a copy of the GNU Lesser General Public License
+* along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
IDENT GMPN_MULWW
**********************************************
diff --git a/gmp/mpn/cray/cfp/submul_1.c b/gmp/mpn/cray/cfp/submul_1.c
index b44c97df45..0507d0ef2e 100644
--- a/gmp/mpn/cray/cfp/submul_1.c
+++ b/gmp/mpn/cray/cfp/submul_1.c
@@ -5,28 +5,17 @@ Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
diff --git a/gmp/mpn/cray/gmp-mparam.h b/gmp/mpn/cray/gmp-mparam.h
index ea8c25b32e..72dcb627da 100644
--- a/gmp/mpn/cray/gmp-mparam.h
+++ b/gmp/mpn/cray/gmp-mparam.h
@@ -1,35 +1,25 @@
/* Cray T90 CFP gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1996, 2000-2004 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1996, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
#if 0
#define UMUL_TIME 519
@@ -40,11 +30,11 @@ see https://www.gnu.org/licenses/. */
/* Generated by tuneup.c, 2004-02-07, system compiler */
-#define MUL_TOOM22_THRESHOLD 71
-#define MUL_TOOM33_THRESHOLD 131
+#define MUL_KARATSUBA_THRESHOLD 71
+#define MUL_TOOM3_THRESHOLD 131
#define SQR_BASECASE_THRESHOLD 32
-#define SQR_TOOM2_THRESHOLD 199
+#define SQR_KARATSUBA_THRESHOLD 199
#define SQR_TOOM3_THRESHOLD 363
#define DIV_SB_PREINV_THRESHOLD 0 /* (preinv always) */
diff --git a/gmp/mpn/cray/hamdist.c b/gmp/mpn/cray/hamdist.c
index 8eb9ba018c..d80b4d6324 100644
--- a/gmp/mpn/cray/hamdist.c
+++ b/gmp/mpn/cray/hamdist.c
@@ -5,28 +5,17 @@ Copyright 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <intrinsics.h>
#include "gmp.h"
diff --git a/gmp/mpn/cray/ieee/addmul_1.c b/gmp/mpn/cray/ieee/addmul_1.c
index 6318b7c9c2..158a79cba8 100644
--- a/gmp/mpn/cray/ieee/addmul_1.c
+++ b/gmp/mpn/cray/ieee/addmul_1.c
@@ -1,33 +1,22 @@
/* Cray PVP/IEEE mpn_addmul_1 -- multiply a limb vector with a limb and add the
result to a second limb vector.
-Copyright 2000-2002 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* This code runs at just under 9 cycles/limb on a T90. That is not perfect,
mainly due to vector register shortage in the main loop. Assembly code
diff --git a/gmp/mpn/cray/ieee/gmp-mparam.h b/gmp/mpn/cray/ieee/gmp-mparam.h
index 1fdc286574..03d655c814 100644
--- a/gmp/mpn/cray/ieee/gmp-mparam.h
+++ b/gmp/mpn/cray/ieee/gmp-mparam.h
@@ -1,44 +1,33 @@
/* Cray T90 IEEE gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1996, 2000-2002, 2004 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 1996, 2000, 2001, 2002, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
/* Generated by tuneup.c, 2004-02-07, system compiler */
-#define MUL_TOOM22_THRESHOLD 130
-#define MUL_TOOM33_THRESHOLD 260
+#define MUL_KARATSUBA_THRESHOLD 130
+#define MUL_TOOM3_THRESHOLD 260
#define SQR_BASECASE_THRESHOLD 9 /* karatsuba */
-#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */
+#define SQR_KARATSUBA_THRESHOLD 0 /* never sqr_basecase */
#define SQR_TOOM3_THRESHOLD 34
#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */
diff --git a/gmp/mpn/cray/ieee/invert_limb.c b/gmp/mpn/cray/ieee/invert_limb.c
index f951a6e138..e3484a9153 100644
--- a/gmp/mpn/cray/ieee/invert_limb.c
+++ b/gmp/mpn/cray/ieee/invert_limb.c
@@ -5,28 +5,17 @@ Copyright 1991, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -84,14 +73,14 @@ mpn_invert_limb (mp_limb_t d)
mp_limb_t xh, xl;
mp_limb_t zh, zl;
-#if GMP_LIMB_BITS == 32
+#if BITS_PER_MP_LIMB == 32
z = approx_tab[(d >> 23) - 0x100] << 6; /* z < 2^16 */
z2l = z * z; /* z2l < 2^32 */
umul_ppmm (th, tl, z2l, d);
z = (z << 17) - (th << 1);
#endif
-#if GMP_LIMB_BITS == 64
+#if BITS_PER_MP_LIMB == 64
z = approx_tab[(d >> 55) - 0x100] << 6; /* z < 2^16 */
z2l = z * z; /* z2l < 2^32 */
@@ -108,7 +97,7 @@ mpn_invert_limb (mp_limb_t d)
umul_ppmm (xh, xl, z2l, d);
tl += xh;
th += tl < xh;
- th = (th << 2) | (tl >> GMP_LIMB_BITS - 2);
+ th = (th << 2) | (tl >> BITS_PER_MP_LIMB - 2);
tl = tl << 2;
sub_ddmmss (zh, zl, z << 2, 0, th, tl);
diff --git a/gmp/mpn/cray/ieee/mul_1.c b/gmp/mpn/cray/ieee/mul_1.c
index dad09fa8cf..4dc2fd9dec 100644
--- a/gmp/mpn/cray/ieee/mul_1.c
+++ b/gmp/mpn/cray/ieee/mul_1.c
@@ -6,28 +6,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* This code runs at 5 cycles/limb on a T90. That would probably
be hard to improve upon, even with assembly code. */
diff --git a/gmp/mpn/cray/ieee/mul_basecase.c b/gmp/mpn/cray/ieee/mul_basecase.c
index 6dc845dd99..ea32db312e 100644
--- a/gmp/mpn/cray/ieee/mul_basecase.c
+++ b/gmp/mpn/cray/ieee/mul_basecase.c
@@ -5,28 +5,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* The most critical loop of this code runs at about 5 cycles/limb on a T90.
That is not perfect, mainly due to vector register shortage. */
diff --git a/gmp/mpn/cray/ieee/sqr_basecase.c b/gmp/mpn/cray/ieee/sqr_basecase.c
index 840d3dd260..92a9a0e14f 100644
--- a/gmp/mpn/cray/ieee/sqr_basecase.c
+++ b/gmp/mpn/cray/ieee/sqr_basecase.c
@@ -5,28 +5,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* This is just mpn_mul_basecase with trivial modifications. */
diff --git a/gmp/mpn/cray/ieee/submul_1.c b/gmp/mpn/cray/ieee/submul_1.c
index 27a1939019..4d7a6b47cd 100644
--- a/gmp/mpn/cray/ieee/submul_1.c
+++ b/gmp/mpn/cray/ieee/submul_1.c
@@ -1,33 +1,22 @@
/* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and
subtract the result from a second limb vector.
-Copyright 2000-2002 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* This code runs at just under 9 cycles/limb on a T90. That is not perfect,
mainly due to vector register shortage in the main loop. Assembly code
diff --git a/gmp/mpn/cray/lshift.c b/gmp/mpn/cray/lshift.c
index 074f38041a..64302e9d39 100644
--- a/gmp/mpn/cray/lshift.c
+++ b/gmp/mpn/cray/lshift.c
@@ -5,28 +5,17 @@ Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <intrinsics.h>
#include "gmp.h"
@@ -40,7 +29,7 @@ mpn_lshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt)
mp_limb_t retval;
sh_1 = cnt;
- sh_2 = GMP_LIMB_BITS - sh_1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
retval = up[n - 1] >> sh_2;
#pragma _CRI ivdep
diff --git a/gmp/mpn/cray/mulww.f b/gmp/mpn/cray/mulww.f
index 9bddf05bc9..e0bf96e441 100644
--- a/gmp/mpn/cray/mulww.f
+++ b/gmp/mpn/cray/mulww.f
@@ -1,36 +1,25 @@
-c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
-
-c Copyright 1996, 2000 Free Software Foundation, Inc.
-
-c This file is part of the GNU MP Library.
-c
-c The GNU MP Library is free software; you can redistribute it and/or modify
-c it under the terms of either:
-c
-c * the GNU Lesser General Public License as published by the Free
-c Software Foundation; either version 3 of the License, or (at your
-c option) any later version.
-c
-c or
-c
-c * the GNU General Public License as published by the Free Software
-c Foundation; either version 2 of the License, or (at your option) any
-c later version.
-c
-c or both in parallel, as here.
-c
-c The GNU MP Library is distributed in the hope that it will be useful, but
-c WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-c or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-c for more details.
-c
-c You should have received copies of the GNU General Public License and the
-c GNU Lesser General Public License along with the GNU MP Library. If not,
-c see https://www.gnu.org/licenses/.
-
-c p1[] = hi(a[]*s); the upper limbs of each product
-c p0[] = low(a[]*s); the corresponding lower limbs
-c n is number of limbs in the vectors
+c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+c Copyright 1996, 2000 Free Software Foundation, Inc.
+
+c This file is part of the GNU MP Library.
+
+c The GNU MP Library is free software; you can redistribute it and/or
+c modify it under the terms of the GNU Lesser General Public License as
+c published by the Free Software Foundation; either version 3 of the
+c License, or (at your option) any later version.
+
+c The GNU MP Library is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+c Lesser General Public License for more details.
+
+c You should have received a copy of the GNU Lesser General Public License
+c along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+c p1[] = hi(a[]*s); the upper limbs of each product
+c p0[] = low(a[]*s); the corresponding lower limbs
+c n is number of limbs in the vectors
subroutine gmpn_mulww(p1,p0,a,n,s)
integer*8 p1(0:*),p0(0:*),a(0:*),s
diff --git a/gmp/mpn/cray/popcount.c b/gmp/mpn/cray/popcount.c
index 48ddab875e..3abdce85cc 100644
--- a/gmp/mpn/cray/popcount.c
+++ b/gmp/mpn/cray/popcount.c
@@ -5,28 +5,17 @@ Copyright 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <intrinsics.h>
#include "gmp.h"
diff --git a/gmp/mpn/cray/rshift.c b/gmp/mpn/cray/rshift.c
index 424bede9db..6280d2ca7e 100644
--- a/gmp/mpn/cray/rshift.c
+++ b/gmp/mpn/cray/rshift.c
@@ -5,28 +5,17 @@ Copyright (C) 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <intrinsics.h>
#include "gmp.h"
@@ -40,7 +29,7 @@ mpn_rshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt)
mp_limb_t retval;
sh_1 = cnt;
- sh_2 = GMP_LIMB_BITS - sh_1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
retval = up[0] << sh_2;
#pragma _CRI ivdep
diff --git a/gmp/mpn/cray/sub_n.c b/gmp/mpn/cray/sub_n.c
index 0cc9ad1e04..90a5f1b1e8 100644
--- a/gmp/mpn/cray/sub_n.c
+++ b/gmp/mpn/cray/sub_n.c
@@ -6,28 +6,17 @@ Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* This code runs at 4 cycles/limb. It may be possible to bring it down
to 3 cycles/limb. */
diff --git a/gmp/mpn/generic/add.c b/gmp/mpn/generic/add.c
index 559f26133c..8065ccf3c2 100644
--- a/gmp/mpn/generic/add.c
+++ b/gmp/mpn/generic/add.c
@@ -5,28 +5,17 @@ Copyright 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define __GMP_FORCE_mpn_add 1
diff --git a/gmp/mpn/generic/add_1.c b/gmp/mpn/generic/add_1.c
index ca2d866852..2d3fa76c2e 100644
--- a/gmp/mpn/generic/add_1.c
+++ b/gmp/mpn/generic/add_1.c
@@ -5,28 +5,17 @@ Copyright 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define __GMP_FORCE_mpn_add_1 1
diff --git a/gmp/mpn/generic/add_err1_n.c b/gmp/mpn/generic/add_err1_n.c
deleted file mode 100644
index b8cb75f6e8..0000000000
--- a/gmp/mpn/generic/add_err1_n.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/* mpn_add_err1_n -- add_n with one error term
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- Computes:
-
- (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy,
- return value is carry out.
-
- (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy).
- Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep.
-
- Requires n >= 1.
-
- None of the outputs may overlap each other or any of the inputs, except
- that {rp,n} may be equal to {up,n} or {vp,n}.
-*/
-mp_limb_t
-mpn_add_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
- mp_ptr ep, mp_srcptr yp,
- mp_size_t n, mp_limb_t cy)
-{
- mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, up, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n));
-
- yp += n - 1;
- el = eh = 0;
-
- do
- {
- yl = *yp--;
- ul = *up++;
- vl = *vp++;
-
- /* ordinary add_n */
- ADDC_LIMB (cy1, sl, ul, vl);
- ADDC_LIMB (cy2, rl, sl, cy);
- cy = cy1 | cy2;
- *rp++ = rl;
-
- /* update (eh:el) */
- zl = (-cy) & yl;
- el += zl;
- eh += el < zl;
- }
- while (--n);
-
-#if GMP_NAIL_BITS != 0
- eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS);
- el &= GMP_NUMB_MASK;
-#endif
-
- ep[0] = el;
- ep[1] = eh;
-
- return cy;
-}
diff --git a/gmp/mpn/generic/add_err2_n.c b/gmp/mpn/generic/add_err2_n.c
deleted file mode 100644
index 4b0242a32d..0000000000
--- a/gmp/mpn/generic/add_err2_n.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/* mpn_add_err2_n -- add_n with two error terms
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- Computes:
-
- (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy,
- return value is carry out.
-
- (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy).
- Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
- c[1]*yp2[n-1] + ... + c[n]*yp2[0],
- stores two-limb results at {ep,2} and {ep+2,2} respectively.
-
- Requires n >= 1.
-
- None of the outputs may overlap each other or any of the inputs, except
- that {rp,n} may be equal to {up,n} or {vp,n}.
-*/
-mp_limb_t
-mpn_add_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
- mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2,
- mp_size_t n, mp_limb_t cy)
-{
- mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, up, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, yp1, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n));
-
- yp1 += n - 1;
- yp2 += n - 1;
- el1 = eh1 = 0;
- el2 = eh2 = 0;
-
- do
- {
- yl1 = *yp1--;
- yl2 = *yp2--;
- ul = *up++;
- vl = *vp++;
-
- /* ordinary add_n */
- ADDC_LIMB (cy1, sl, ul, vl);
- ADDC_LIMB (cy2, rl, sl, cy);
- cy = cy1 | cy2;
- *rp++ = rl;
-
- /* update (eh1:el1) */
- zl1 = (-cy) & yl1;
- el1 += zl1;
- eh1 += el1 < zl1;
-
- /* update (eh2:el2) */
- zl2 = (-cy) & yl2;
- el2 += zl2;
- eh2 += el2 < zl2;
- }
- while (--n);
-
-#if GMP_NAIL_BITS != 0
- eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
- el1 &= GMP_NUMB_MASK;
- eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
- el2 &= GMP_NUMB_MASK;
-#endif
-
- ep[0] = el1;
- ep[1] = eh1;
- ep[2] = el2;
- ep[3] = eh2;
-
- return cy;
-}
diff --git a/gmp/mpn/generic/add_err3_n.c b/gmp/mpn/generic/add_err3_n.c
deleted file mode 100644
index 28cd7facf9..0000000000
--- a/gmp/mpn/generic/add_err3_n.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/* mpn_add_err3_n -- add_n with three error terms
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- Computes:
-
- (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy,
- return value is carry out.
-
- (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy).
- Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
- c[1]*yp2[n-1] + ... + c[n]*yp2[0],
- c[1]*yp3[n-1] + ... + c[n]*yp3[0],
- stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively.
-
- Requires n >= 1.
-
- None of the outputs may overlap each other or any of the inputs, except
- that {rp,n} may be equal to {up,n} or {vp,n}.
-*/
-mp_limb_t
-mpn_add_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
- mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3,
- mp_size_t n, mp_limb_t cy)
-{
- mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, up, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, yp2, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n));
-
- yp1 += n - 1;
- yp2 += n - 1;
- yp3 += n - 1;
- el1 = eh1 = 0;
- el2 = eh2 = 0;
- el3 = eh3 = 0;
-
- do
- {
- yl1 = *yp1--;
- yl2 = *yp2--;
- yl3 = *yp3--;
- ul = *up++;
- vl = *vp++;
-
- /* ordinary add_n */
- ADDC_LIMB (cy1, sl, ul, vl);
- ADDC_LIMB (cy2, rl, sl, cy);
- cy = cy1 | cy2;
- *rp++ = rl;
-
- /* update (eh1:el1) */
- zl1 = (-cy) & yl1;
- el1 += zl1;
- eh1 += el1 < zl1;
-
- /* update (eh2:el2) */
- zl2 = (-cy) & yl2;
- el2 += zl2;
- eh2 += el2 < zl2;
-
- /* update (eh3:el3) */
- zl3 = (-cy) & yl3;
- el3 += zl3;
- eh3 += el3 < zl3;
- }
- while (--n);
-
-#if GMP_NAIL_BITS != 0
- eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
- el1 &= GMP_NUMB_MASK;
- eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
- el2 &= GMP_NUMB_MASK;
- eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS);
- el3 &= GMP_NUMB_MASK;
-#endif
-
- ep[0] = el1;
- ep[1] = eh1;
- ep[2] = el2;
- ep[3] = eh2;
- ep[4] = el3;
- ep[5] = eh3;
-
- return cy;
-}
diff --git a/gmp/mpn/generic/add_n.c b/gmp/mpn/generic/add_n.c
index 1a07670900..5006e27780 100644
--- a/gmp/mpn/generic/add_n.c
+++ b/gmp/mpn/generic/add_n.c
@@ -1,32 +1,21 @@
/* mpn_add_n -- Add equal length limb vectors.
-Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc.
+Copyright 1992, 1993, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -40,8 +29,8 @@ mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
mp_limb_t ul, vl, sl, rl, cy, cy1, cy2;
ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
- ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
cy = 0;
do
@@ -70,8 +59,8 @@ mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
mp_limb_t ul, vl, rl, cy;
ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
- ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
cy = 0;
do
diff --git a/gmp/mpn/generic/addmul_1.c b/gmp/mpn/generic/addmul_1.c
index d76b4ad135..861e1bc830 100644
--- a/gmp/mpn/generic/addmul_1.c
+++ b/gmp/mpn/generic/addmul_1.c
@@ -3,33 +3,23 @@
pointed to by RP. Return the most significant limb of the product,
adjusted for carry-out from the addition.
-Copyright 1992-1994, 1996, 2000, 2002, 2004 Free Software Foundation, Inc.
+Copyright 1992, 1993, 1994, 1996, 2000, 2002, 2004 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/add_n_sub_n.c b/gmp/mpn/generic/addsub_n.c
index 012eb3e33a..452cf7b211 100644
--- a/gmp/mpn/generic/add_n_sub_n.c
+++ b/gmp/mpn/generic/addsub_n.c
@@ -1,36 +1,25 @@
-/* mpn_add_n_sub_n -- Add and Subtract two limb vectors of equal, non-zero length.
+/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 1999-2001, 2006 Free Software Foundation, Inc.
+Copyright 1999, 2000, 2001, 2006 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -39,16 +28,16 @@ see https://www.gnu.org/licenses/. */
#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
#endif
-#define PART_SIZE (L1_CACHE_SIZE / GMP_LIMB_BYTES / 6)
+#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
-/* mpn_add_n_sub_n.
+/* mpn_addsub_n.
r1[] = s1[] + s2[]
r2[] = s1[] - s2[]
All operands have n limbs.
In-place operations allowed. */
mp_limb_t
-mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
+mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
{
mp_limb_t acyn, acyo; /* carry for add */
mp_limb_t scyn, scyo; /* carry for subtract */
@@ -153,19 +142,19 @@ main (int argc, char **argv)
n = strtol (argv[1], 0, 0);
- r1p = malloc (n * GMP_LIMB_BYTES);
- r2p = malloc (n * GMP_LIMB_BYTES);
- s1p = malloc (n * GMP_LIMB_BYTES);
- s2p = malloc (n * GMP_LIMB_BYTES);
+ r1p = malloc (n * BYTES_PER_MP_LIMB);
+ r2p = malloc (n * BYTES_PER_MP_LIMB);
+ s1p = malloc (n * BYTES_PER_MP_LIMB);
+ s2p = malloc (n * BYTES_PER_MP_LIMB);
TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
printf (" separate add and sub: %.3f\n", t);
- TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n));
+ TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
printf ("combined addsub separate variables: %.3f\n", t);
- TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
printf (" combined addsub r1 overlap: %.3f\n", t);
- TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
printf (" combined addsub r2 overlap: %.3f\n", t);
- TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n));
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
printf (" combined addsub in-place: %.3f\n", t);
return 0;
diff --git a/gmp/mpn/generic/bdiv_dbm1c.c b/gmp/mpn/generic/bdiv_dbm1c.c
index 22c3cfd2c8..23cb6f1c9e 100644
--- a/gmp/mpn/generic/bdiv_dbm1c.c
+++ b/gmp/mpn/generic/bdiv_dbm1c.c
@@ -10,28 +10,17 @@ Copyright 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/bdiv_q.c b/gmp/mpn/generic/bdiv_q.c
deleted file mode 100644
index 1fc1bb7c09..0000000000
--- a/gmp/mpn/generic/bdiv_q.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/* mpn_bdiv_q -- Hensel division with precomputed inverse, returning quotient.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2006, 2007, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-/* Computes Q = N / D mod B^n. */
-
-void
-mpn_bdiv_q (mp_ptr qp,
- mp_srcptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_ptr tp)
-{
- mp_limb_t di;
-
- if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD))
- {
- MPN_COPY (tp, np, nn);
- binvert_limb (di, dp[0]); di = -di;
- mpn_sbpi1_bdiv_q (qp, tp, nn, dp, dn, di);
- }
- else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
- {
- MPN_COPY (tp, np, nn);
- binvert_limb (di, dp[0]); di = -di;
- mpn_dcpi1_bdiv_q (qp, tp, nn, dp, dn, di);
- }
- else
- {
- mpn_mu_bdiv_q (qp, np, nn, dp, dn, tp);
- }
- return;
-}
-
-mp_size_t
-mpn_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
-{
- if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
- return nn;
- else
- return mpn_mu_bdiv_q_itch (nn, dn);
-}
diff --git a/gmp/mpn/generic/bdiv_q_1.c b/gmp/mpn/generic/bdiv_q_1.c
deleted file mode 100644
index 74b247d5a9..0000000000
--- a/gmp/mpn/generic/bdiv_q_1.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/* mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by 1-limb
- divisor, returning quotient only.
-
- THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
- CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
- FUTURE GNU MP RELEASES.
-
-Copyright 2000-2003, 2005, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-mp_limb_t
-mpn_pi1_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d,
- mp_limb_t di, int shift)
-{
- mp_size_t i;
- mp_limb_t c, h, l, u, u_next, dummy;
-
- ASSERT (n >= 1);
- ASSERT (d != 0);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT_MPN (up, n);
- ASSERT_LIMB (d);
-
- d <<= GMP_NAIL_BITS;
-
- if (shift != 0)
- {
- c = 0;
-
- u = up[0];
- rp--;
- for (i = 1; i < n; i++)
- {
- u_next = up[i];
- u = ((u >> shift) | (u_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK;
-
- SUBC_LIMB (c, l, u, c);
-
- l = (l * di) & GMP_NUMB_MASK;
- rp[i] = l;
-
- umul_ppmm (h, dummy, l, d);
- c += h;
- u = u_next;
- }
-
- u = u >> shift;
- l = u - c;
- l = (l * di) & GMP_NUMB_MASK;
- rp[i] = l;
- }
- else
- {
- u = up[0];
- l = (u * di) & GMP_NUMB_MASK;
- rp[0] = l;
- c = 0;
-
- for (i = 1; i < n; i++)
- {
- umul_ppmm (h, dummy, l, d);
- c += h;
-
- u = up[i];
- SUBC_LIMB (c, l, u, c);
-
- l = (l * di) & GMP_NUMB_MASK;
- rp[i] = l;
- }
- }
-
- return c;
-}
-
-mp_limb_t
-mpn_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d)
-{
- mp_limb_t di;
- int shift;
-
- ASSERT (n >= 1);
- ASSERT (d != 0);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT_MPN (up, n);
- ASSERT_LIMB (d);
-
- if ((d & 1) == 0)
- {
- count_trailing_zeros (shift, d);
- d >>= shift;
- }
- else
- shift = 0;
-
- binvert_limb (di, d);
- return mpn_pi1_bdiv_q_1 (rp, up, n, d, di, shift);
-}
diff --git a/gmp/mpn/generic/bdiv_qr.c b/gmp/mpn/generic/bdiv_qr.c
deleted file mode 100644
index 6a5eedbbc2..0000000000
--- a/gmp/mpn/generic/bdiv_qr.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* mpn_bdiv_qr -- Hensel division with precomputed inverse, returning quotient
- and remainder.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2006, 2007, 2009, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-/* Computes Q = N / D mod B^n,
- R = N - QD. */
-
-mp_limb_t
-mpn_bdiv_qr (mp_ptr qp, mp_ptr rp,
- mp_srcptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_ptr tp)
-{
- mp_limb_t di;
- mp_limb_t rh;
-
- ASSERT (nn > dn);
- if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) ||
- BELOW_THRESHOLD (nn - dn, DC_BDIV_QR_THRESHOLD))
- {
- MPN_COPY (tp, np, nn);
- binvert_limb (di, dp[0]); di = -di;
- rh = mpn_sbpi1_bdiv_qr (qp, tp, nn, dp, dn, di);
- MPN_COPY (rp, tp + nn - dn, dn);
- }
- else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD))
- {
- MPN_COPY (tp, np, nn);
- binvert_limb (di, dp[0]); di = -di;
- rh = mpn_dcpi1_bdiv_qr (qp, tp, nn, dp, dn, di);
- MPN_COPY (rp, tp + nn - dn, dn);
- }
- else
- {
- rh = mpn_mu_bdiv_qr (qp, rp, np, nn, dp, dn, tp);
- }
-
- return rh;
-}
-
-mp_size_t
-mpn_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
-{
- if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD))
- return nn;
- else
- return mpn_mu_bdiv_qr_itch (nn, dn);
-}
diff --git a/gmp/mpn/generic/bdivmod.c b/gmp/mpn/generic/bdivmod.c
new file mode 100644
index 0000000000..783b594082
--- /dev/null
+++ b/gmp/mpn/generic/bdivmod.c
@@ -0,0 +1,124 @@
+/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d.
+
+Copyright 1991, 1993, 1994, 1995, 1996, 1999, 2000, 2001, 2002 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d).
+
+ Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and
+ returns the high d%BITS_PER_MP_LIMB bits of Q as the result.
+
+ Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the
+ low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows
+ the limb vectors at qp to overwrite the low limbs at up, provided qp <= up.
+
+ Preconditions:
+ 1. V is odd.
+ 2. usize * BITS_PER_MP_LIMB >= d.
+ 3. If Q and U overlap, qp <= up.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+ 301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ References:
+ T. Jebelean, An algorithm for exact division, Journal of Symbolic
+ Computation, v. 15, 1993, pp. 169-180.
+
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+mp_limb_t
+mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
+ mp_srcptr vp, mp_size_t vsize, unsigned long int d)
+{
+ mp_limb_t v_inv;
+
+ ASSERT (usize >= 1);
+ ASSERT (vsize >= 1);
+ ASSERT (usize * GMP_NUMB_BITS >= d);
+ ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
+ ASSERT (! MPN_OVERLAP_P (qp, d/GMP_NUMB_BITS, vp, vsize));
+ ASSERT (MPN_SAME_OR_INCR2_P (qp, d/GMP_NUMB_BITS, up, usize));
+ ASSERT_MPN (up, usize);
+ ASSERT_MPN (vp, vsize);
+
+ /* v_inv = 1/V mod 2^GMP_NUMB_BITS, derived from the low limb vp[0] only. */
+ binvert_limb (v_inv, vp[0]);
+
+ /* Fast path: 2-limb U and V with d exactly one or two limbs, as previously used
+ by the accel part of mpn_gcd. (Could probably be removed now it's inlined there.) */
+ if (usize == 2 && vsize == 2 &&
+ (d == GMP_NUMB_BITS || d == 2*GMP_NUMB_BITS))
+ {
+ mp_limb_t hi, lo;
+ mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK; /* low quotient limb */
+ umul_ppmm (hi, lo, q, vp[0] << GMP_NAIL_BITS); /* only hi is used: part of q*vp[0] reaching limb 1 */
+ up[0] = 0; /* low limb of U - q*V is stored as zero */
+ up[1] -= hi + q*vp[1]; /* remaining limb of U - q*V, wrapping mod the limb size */
+ qp[0] = q;
+ if (d == 2*GMP_NUMB_BITS)
+ {
+ q = (up[1] * v_inv) & GMP_NUMB_MASK; /* second quotient limb, from the updated up[1] */
+ up[1] = 0;
+ qp[1] = q;
+ }
+ return 0; /* d is a whole number of limbs here, so there are no partial high bits */
+ }
+
+ /* Main loop: while at least one full limb of d remains, produce one quotient limb. */
+ while (d >= GMP_NUMB_BITS)
+ {
+ mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
+ mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); /* U -= q*V on the common limbs */
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); /* apply b to the limbs of U beyond V */
+ d -= GMP_NUMB_BITS;
+ up += 1, usize -= 1; /* step past the limb just processed */
+ *qp++ = q; /* stored only after up has advanced */
+ }
+
+ if (d) /* 0 < d < GMP_NUMB_BITS: a partial quotient limb remains */
+ {
+ mp_limb_t b;
+ mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1); /* keep only the low d bits */
+ if (q <= 1)
+ {
+ if (q == 0)
+ return 0; /* nothing to subtract; U is left unchanged */
+ else
+ b = mpn_sub_n (up, up, vp, MIN (usize, vsize)); /* q == 1: plain subtract, no multiply */
+ }
+ else
+ b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
+
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); /* apply b to the limbs of U beyond V */
+ return q; /* the partial limb is returned rather than stored at qp */
+ }
+
+ return 0;
+}
diff --git a/gmp/mpn/generic/binvert.c b/gmp/mpn/generic/binvert.c
index be27ea552e..24d4dcdb6f 100644
--- a/gmp/mpn/generic/binvert.c
+++ b/gmp/mpn/generic/binvert.c
@@ -1,38 +1,28 @@
-/* Compute {up,n}^(-1) mod B^n.
+/* Compute {up,n}^(-1) mod 2^(n*GMP_NUMB_BITS).
Contributed to the GNU project by Torbjorn Granlund.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
-Copyright (C) 2004-2007, 2009, 2012 Free Software Foundation, Inc.
+Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -43,6 +33,14 @@ see https://www.gnu.org/licenses/. */
r[k+1] = r[k] + r[k] - r[k]*(u*r[k])
*/
+/* This is intended for constant THRESHOLDs only, where the compiler can
+ completely fold the result. */
+#define LOG2C(n) \
+ (((n) >= 0x1) + ((n) >= 0x2) + ((n) >= 0x4) + ((n) >= 0x8) + \
+ ((n) >= 0x10) + ((n) >= 0x20) + ((n) >= 0x40) + ((n) >= 0x80) + \
+ ((n) >= 0x100) + ((n) >= 0x200) + ((n) >= 0x400) + ((n) >= 0x800) + \
+ ((n) >= 0x1000) + ((n) >= 0x2000) + ((n) >= 0x4000) + ((n) >= 0x8000))
+
#if TUNE_PROGRAM_BUILD
#define NPOWS \
((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
@@ -54,9 +52,12 @@ see https://www.gnu.org/licenses/. */
mp_size_t
mpn_binvert_itch (mp_size_t n)
{
- mp_size_t itch_local = mpn_mulmod_bnm1_next_size (n);
- mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, n, (n + 1) >> 1);
- return itch_local + itch_out;
+#if WANT_FFT
+ if (ABOVE_THRESHOLD (n, 2 * MUL_FFT_MODF_THRESHOLD))
+ return mpn_fft_next_size (n, mpn_fft_best_k (n, 0));
+ else
+#endif
+ return 3 * (n - (n >> 1));
}
void
@@ -75,28 +76,42 @@ mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
xp = scratch;
- /* Compute a base value of rn limbs. */
+ /* Compute a base value using a low-overhead O(n^2) algorithm. FIXME: We
+ should call some divide-and-conquer lsb division function here for an
+ operand subrange. */
MPN_ZERO (xp, rn);
xp[0] = 1;
binvert_limb (di, up[0]);
if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD))
- mpn_sbpi1_bdiv_q (rp, xp, rn, up, rn, -di);
+ mpn_sb_bdiv_q (rp, xp, rn, up, rn, -di);
else
- mpn_dcpi1_bdiv_q (rp, xp, rn, up, rn, -di);
+ mpn_dc_bdiv_q (rp, xp, rn, up, rn, -di);
/* Use Newton iterations to get the desired precision. */
for (; rn < n; rn = newrn)
{
- mp_size_t m;
newrn = *--sizp;
- /* X <- UR. */
- m = mpn_mulmod_bnm1_next_size (newrn);
- mpn_mulmod_bnm1 (xp, m, up, newrn, rp, rn, xp + m);
- mpn_sub_1 (xp + m, xp, rn - (m - newrn), 1);
-
- /* R = R(X/B^rn) */
- mpn_mullo_n (rp + rn, rp, xp + rn, newrn - rn);
- mpn_neg (rp + rn, rp + rn, newrn - rn);
+#if WANT_FFT
+ if (ABOVE_THRESHOLD (newrn, 2 * MUL_FFT_MODF_THRESHOLD))
+ {
+ int k;
+ mp_size_t m, i;
+
+ k = mpn_fft_best_k (newrn, 0);
+ m = mpn_fft_next_size (newrn, k);
+ mpn_mul_fft (xp, m, up, newrn, rp, rn, k);
+ for (i = rn - 1; i >= 0; i--)
+ if (xp[i] > (i == 0))
+ {
+ mpn_add_1 (xp + rn, xp + rn, newrn - rn, 1);
+ break;
+ }
+ }
+ else
+#endif
+ mpn_mul (xp, up, newrn, rp, rn);
+ mpn_mullow_n (rp + rn, rp, xp + rn, newrn - rn);
+ mpn_neg_n (rp + rn, rp + rn, newrn - rn);
}
}
diff --git a/gmp/mpn/generic/broot.c b/gmp/mpn/generic/broot.c
deleted file mode 100644
index 6974ac8b9e..0000000000
--- a/gmp/mpn/generic/broot.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/* mpn_broot -- Compute hensel sqrt
-
- Contributed to the GNU project by Niels Möller
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Computes a^e (mod B). Uses right-to-left binary algorithm, since
- typical use will have e small. */
-static mp_limb_t
-powlimb (mp_limb_t a, mp_limb_t e)
-{
- mp_limb_t r = 1;
- mp_limb_t s = a;
-
- for (r = 1, s = a; e > 0; e >>= 1, s *= s)
- if (e & 1)
- r *= s;
-
- return r;
-}
-
-/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.
-
- Iterates
-
- r' <-- r - r * (a^{k-1} r^k - 1) / n
-
- If
-
- a^{k-1} r^k = 1 (mod 2^m),
-
- then
-
- a^{k-1} r'^k = 1 (mod 2^{2m}),
-
- Compute the update term as
-
- r' = r - (a^{k-1} r^{k+1} - r) / k
-
- where we still have cancellation of low limbs.
-
- */
-void
-mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
-{
- mp_size_t sizes[GMP_LIMB_BITS * 2];
- mp_ptr akm1, tp, rnp, ep;
- mp_limb_t a0, r0, km1, kp1h, kinv;
- mp_size_t rn;
- unsigned i;
-
- TMP_DECL;
-
- ASSERT (n > 0);
- ASSERT (ap[0] & 1);
- ASSERT (k & 1);
- ASSERT (k >= 3);
-
- TMP_MARK;
-
- akm1 = TMP_ALLOC_LIMBS (4*n);
- tp = akm1 + n;
-
- km1 = k-1;
- /* FIXME: Could arrange the iteration so we don't need to compute
- this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note
- that we can use wraparound also for a*r, since the low half is
- unchanged from the previous iteration. Or possibly mulmid. Also,
- a r = a^{1/k}, so we get that value too, for free? */
- mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */
-
- a0 = ap[0];
- binvert_limb (kinv, k);
-
- /* 4 bits: a^{1/k - 1} (mod 16):
-
- a % 8
- 1 3 5 7
- k%4 +-------
- 1 |1 1 1 1
- 3 |1 9 9 1
- */
- r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
- r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */
- r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */
- r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */
-#if GMP_NUMB_BITS > 32
- {
- unsigned prec = 32;
- do
- {
- r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
- prec *= 2;
- }
- while (prec < GMP_NUMB_BITS);
- }
-#endif
-
- rp[0] = r0;
- if (n == 1)
- {
- TMP_FREE;
- return;
- }
-
- /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */
- kp1h = k/2 + 1;
-
- /* FIXME: Special case for two limb iteration. */
- rnp = TMP_ALLOC_LIMBS (2*n + 1);
- ep = rnp + n;
-
- /* FIXME: Possible to this on the fly with some bit fiddling. */
- for (i = 0; n > 1; n = (n + 1)/2)
- sizes[i++] = n;
-
- rn = 1;
-
- while (i-- > 0)
- {
- /* Compute x^{k+1}. */
- mpn_sqr (ep, rp, rn); /* For odd n, writes n+1 limbs in the
- final iteration. */
- mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp);
-
- /* Multiply by a^{k-1}. Can use wraparound; low part equals r. */
-
- mpn_mullo_n (ep, rnp, akm1, sizes[i]);
- ASSERT (mpn_cmp (ep, rp, rn) == 0);
-
- ASSERT (sizes[i] <= 2*rn);
- mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0);
- mpn_neg (rp + rn, rp + rn, sizes[i] - rn);
- rn = sizes[i];
- }
- TMP_FREE;
-}
-
-/* Computes a^{1/k} (mod B^n). Both a and k must be odd. */
-void
-mpn_broot (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
-{
- mp_ptr tp;
- TMP_DECL;
-
- ASSERT (n > 0);
- ASSERT (ap[0] & 1);
- ASSERT (k & 1);
-
- if (k == 1)
- {
- MPN_COPY (rp, ap, n);
- return;
- }
-
- TMP_MARK;
- tp = TMP_ALLOC_LIMBS (n);
-
- mpn_broot_invm1 (tp, ap, n, k);
- mpn_mullo_n (rp, tp, ap, n);
-
- TMP_FREE;
-}
diff --git a/gmp/mpn/generic/brootinv.c b/gmp/mpn/generic/brootinv.c
deleted file mode 100644
index b96c97f1d3..0000000000
--- a/gmp/mpn/generic/brootinv.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/* mpn_brootinv, compute r such that r^k * y = 1 (mod 2^b).
-
- Contributed to the GNU project by Martin Boij (as part of perfpow.c).
-
-Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Computes a^e (mod B). Uses right-to-left binary algorithm, since
- typical use will have e small. */
-static mp_limb_t
-powlimb (mp_limb_t a, mp_limb_t e)
-{
- mp_limb_t r;
-
- for (r = 1; e > 0; e >>= 1, a *= a)
- if (e & 1)
- r *= a;
-
- return r;
-}
-
-/* Compute r such that r^k * y = 1 (mod B^n).
-
- Iterates
- r' <-- k^{-1} ((k+1) r - r^{k+1} y) (mod 2^b)
- using Hensel lifting, each time doubling the number of known bits in r.
-
- Works just for odd k. Else the Hensel lifting degenerates.
-
- FIXME:
-
- (1) Make it work for k == GMP_LIMB_MAX (k+1 below overflows).
-
- (2) Rewrite iteration as
- r' <-- r - k^{-1} r (r^k y - 1)
- and take advantage of the zero low part of r^k y - 1.
-
- (3) Use wrap-around trick.
-
- (4) Use a small table to get starting value.
-
- Scratch need: 5*bn, where bn = ceil (bnb / GMP_NUMB_BITS).
-*/
-
-void
-mpn_brootinv (mp_ptr rp, mp_srcptr yp, mp_size_t bn, mp_limb_t k, mp_ptr tp)
-{
- mp_ptr tp2, tp3;
- mp_limb_t kinv, k2, r0, y0;
- mp_size_t order[GMP_LIMB_BITS + 1];
- int i, d;
-
- ASSERT (bn > 0);
- ASSERT ((k & 1) != 0);
-
- tp2 = tp + bn;
- tp3 = tp + 2 * bn;
- k2 = k + 1;
-
- binvert_limb (kinv, k);
-
- /* 4-bit initial approximation:
-
- y%16 | 1 3 5 7 9 11 13 15,
- k%4 +-------------------------+k2%4
- 1 | 1 11 13 7 9 3 5 15 | 2
- 3 | 1 3 5 7 9 11 13 15 | 0
-
- */
- y0 = yp[0];
-
- r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 2) & 8); /* 4 bits */
- r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7f)); /* 8 bits */
- r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7fff)); /* 16 bits */
-#if GMP_NUMB_BITS > 16
- {
- unsigned prec = 16;
- do
- {
- r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2));
- prec *= 2;
- }
- while (prec < GMP_NUMB_BITS);
- }
-#endif
-
- rp[0] = r0;
- if (bn == 1)
- return;
-
- /* This initialization doesn't matter for the result (any garbage is
- cancelled in the iteration), but proper initialization makes
- valgrind happier. */
- MPN_ZERO (rp+1, bn-1);
-
- d = 0;
- for (; bn > 1; bn = (bn + 1) >> 1)
- order[d++] = bn;
-
- for (i = d - 1; i >= 0; i--)
- {
- bn = order[i];
-
- mpn_mul_1 (tp, rp, bn, k2);
-
- mpn_powlo (tp2, rp, &k2, 1, bn, tp3);
- mpn_mullo_n (rp, yp, tp2, bn);
-
- mpn_sub_n (tp2, tp, rp, bn);
- mpn_pi1_bdiv_q_1 (rp, tp2, bn, k, kinv, 0);
- }
-}
diff --git a/gmp/mpn/generic/bsqrt.c b/gmp/mpn/generic/bsqrt.c
deleted file mode 100644
index 18ba26f440..0000000000
--- a/gmp/mpn/generic/bsqrt.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/* mpn_bsqrt, a^{1/2} (mod 2^n).
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-void
-mpn_bsqrt (mp_ptr rp, mp_srcptr ap, mp_bitcnt_t nb, mp_ptr tp)
-{
- mp_ptr sp;
- mp_size_t n;
-
- ASSERT (nb > 0);
-
- n = nb / GMP_NUMB_BITS;
- sp = tp + n;
-
- mpn_bsqrtinv (sp, ap, nb, tp);
- mpn_mullo_n (rp, sp, ap, n);
-}
diff --git a/gmp/mpn/generic/bsqrtinv.c b/gmp/mpn/generic/bsqrtinv.c
deleted file mode 100644
index 33df6a3c15..0000000000
--- a/gmp/mpn/generic/bsqrtinv.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/* mpn_bsqrtinv, compute r such that r^2 * y = 1 (mod 2^{b+1}).
-
- Contributed to the GNU project by Martin Boij (as part of perfpow.c).
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Compute r such that r^2 * y = 1 (mod 2^{b+1}).
- Return non-zero if such an integer r exists.
-
- Iterates
- r' <-- (3r - r^3 y) / 2
- using Hensel lifting. Since we divide by two, the Hensel lifting is
- somewhat degenerates. Therefore, we lift from 2^b to 2^{b+1}-1.
-
- FIXME:
- (1) Simplify to do precision book-keeping in limbs rather than bits.
-
- (2) Rewrite iteration as
- r' <-- r - r (r^2 y - 1) / 2
- and take advantage of zero low part of r^2 y - 1.
-
- (3) Use wrap-around trick.
-
- (4) Use a small table to get starting value.
-*/
-int
-mpn_bsqrtinv (mp_ptr rp, mp_srcptr yp, mp_bitcnt_t bnb, mp_ptr tp)
-{
- mp_ptr tp2, tp3;
- mp_limb_t k;
- mp_size_t bn, order[GMP_LIMB_BITS + 1];
- int i, d;
-
- ASSERT (bnb > 0);
-
- bn = 1 + bnb / GMP_LIMB_BITS;
-
- tp2 = tp + bn;
- tp3 = tp + 2 * bn;
- k = 3;
-
- rp[0] = 1;
- if (bnb == 1)
- {
- if ((yp[0] & 3) != 1)
- return 0;
- }
- else
- {
- if ((yp[0] & 7) != 1)
- return 0;
-
- d = 0;
- for (; bnb != 2; bnb = (bnb + 2) >> 1)
- order[d++] = bnb;
-
- for (i = d - 1; i >= 0; i--)
- {
- bnb = order[i];
- bn = 1 + bnb / GMP_LIMB_BITS;
-
- mpn_mul_1 (tp, rp, bn, k);
-
- mpn_powlo (tp2, rp, &k, 1, bn, tp3);
- mpn_mullo_n (rp, yp, tp2, bn);
-
-#if HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (rp, tp, rp, bn);
-#else
- mpn_sub_n (tp2, tp, rp, bn);
- mpn_rshift (rp, tp2, bn, 1);
-#endif
- }
- }
- return 1;
-}
diff --git a/gmp/mpn/generic/cmp.c b/gmp/mpn/generic/cmp.c
index 18c7b42844..d352076599 100644
--- a/gmp/mpn/generic/cmp.c
+++ b/gmp/mpn/generic/cmp.c
@@ -5,28 +5,17 @@ Copyright 1991, 1993, 1994, 1996, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define __GMP_FORCE_mpn_cmp 1
diff --git a/gmp/mpn/generic/cnd_add_n.c b/gmp/mpn/generic/cnd_add_n.c
deleted file mode 100644
index 443f9858da..0000000000
--- a/gmp/mpn/generic/cnd_add_n.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* mpn_cnd_add_n -- Compute R = U + V if CND != 0 or R = U if CND == 0.
- Both cases should take the same time and perform the exact same memory
- accesses, since this function is intended to be used where side-channel
- attack resilience is relevant.
-
-Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-mp_limb_t
-mpn_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
-{
- mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
-
- mask = -(mp_limb_t) (cnd != 0);
- cy = 0;
- do
- {
- ul = *up++;
- vl = *vp++ & mask;
-#if GMP_NAIL_BITS == 0
- sl = ul + vl;
- cy1 = sl < ul;
- rl = sl + cy;
- cy2 = rl < sl;
- cy = cy1 | cy2;
- *rp++ = rl;
-#else
- rl = ul + vl;
- rl += cy;
- cy = rl >> GMP_NUMB_BITS;
- *rp++ = rl & GMP_NUMB_MASK;
-#endif
- }
- while (--n != 0);
-
- return cy;
-}
diff --git a/gmp/mpn/generic/cnd_sub_n.c b/gmp/mpn/generic/cnd_sub_n.c
deleted file mode 100644
index bd8e029a36..0000000000
--- a/gmp/mpn/generic/cnd_sub_n.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* mpn_cnd_sub_n -- Compute R = U - V if CND != 0 or R = U if CND == 0.
- Both cases should take the same time and perform the exact same memory
- accesses, since this function is intended to be used where side-channel
- attack resilience is relevant.
-
-Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-mp_limb_t
-mpn_cnd_sub_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
-{
- mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
-
- mask = -(mp_limb_t) (cnd != 0);
- cy = 0;
- do
- {
- ul = *up++;
- vl = *vp++ & mask;
-#if GMP_NAIL_BITS == 0
- sl = ul - vl;
- cy1 = sl > ul;
- rl = sl - cy;
- cy2 = rl > sl;
- cy = cy1 | cy2;
- *rp++ = rl;
-#else
- rl = ul - vl;
- rl -= cy;
- cy = rl >> (GMP_LIMB_BITS - 1);
- *rp++ = rl & GMP_NUMB_MASK;
-#endif
- }
- while (--n != 0);
-
- return cy;
-}
diff --git a/gmp/mpn/generic/com.c b/gmp/mpn/generic/com.c
deleted file mode 100644
index cd8551df5b..0000000000
--- a/gmp/mpn/generic/com.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/* mpn_com - complement an mpn.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#undef mpn_com
-#define mpn_com __MPN(com)
-
-void
-mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
-{
- mp_limb_t ul;
- do {
- ul = *up++;
- *rp++ = ~ul & GMP_NUMB_MASK;
- } while (--n != 0);
-}
diff --git a/gmp/mpn/generic/comb_tables.c b/gmp/mpn/generic/comb_tables.c
deleted file mode 100644
index 41bcb5f879..0000000000
--- a/gmp/mpn/generic/comb_tables.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Const tables shared among combinatoric functions.
-
- THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND ARE ALMOST CERTAIN TO
- BE SUBJECT TO INCOMPATIBLE CHANGES IN FUTURE GNU MP RELEASES.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Entry i contains (i!/2^t) where t is chosen such that the parenthesis
- is an odd integer. */
-const mp_limb_t __gmp_oddfac_table[] = { ONE_LIMB_ODD_FACTORIAL_TABLE, ONE_LIMB_ODD_FACTORIAL_EXTTABLE };
-
-/* Entry i contains ((2i+1)!!/2^t) where t is chosen such that the parenthesis
- is an odd integer. */
-const mp_limb_t __gmp_odd2fac_table[] = { ONE_LIMB_ODD_DOUBLEFACTORIAL_TABLE };
-
-/* Entry i contains 2i-popc(2i). */
-const unsigned char __gmp_fac2cnt_table[] = { TABLE_2N_MINUS_POPC_2N };
-
-const mp_limb_t __gmp_limbroots_table[] = { NTH_ROOT_NUMB_MASK_TABLE };
diff --git a/gmp/mpn/generic/copyd.c b/gmp/mpn/generic/copyd.c
deleted file mode 100644
index ba3380a82b..0000000000
--- a/gmp/mpn/generic/copyd.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/* mpn_copyd
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-void
-mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
-{
- mp_size_t i;
-
- for (i = n - 1; i >= 0; i--)
- rp[i] = up[i];
-}
diff --git a/gmp/mpn/generic/copyi.c b/gmp/mpn/generic/copyi.c
deleted file mode 100644
index 0c39b4534b..0000000000
--- a/gmp/mpn/generic/copyi.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* mpn_copyi
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-void
-mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
-{
- mp_size_t i;
-
- up += n;
- rp += n;
- for (i = -n; i != 0; i++)
- rp[i] = up[i];
-}
diff --git a/gmp/mpn/generic/dc_bdiv_q.c b/gmp/mpn/generic/dc_bdiv_q.c
new file mode 100644
index 0000000000..9a43d18b56
--- /dev/null
+++ b/gmp/mpn/generic/dc_bdiv_q.c
@@ -0,0 +1,137 @@
+/* mpn_dc_bdiv_q -- divide-and-conquer Hensel division with precomputed
+ inverse, returning quotient.
+
+ Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
+
+Copyright 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Computes Q = N / D mod B^n, destroys N. */
+
+mp_size_t
+mpn_dc_bdiv_q_n_itch (mp_size_t n)
+{
+ /* NOTE: Depends on mullow_n interface */
+ return n;
+}
+
+void
+mpn_dc_bdiv_q_n (mp_ptr qp,
+ mp_ptr np, mp_srcptr dp, mp_size_t n,
+ mp_limb_t dinv, mp_ptr tp)
+{
+ while (ABOVE_THRESHOLD (n, DC_BDIV_Q_THRESHOLD))
+ {
+ mp_limb_t l, h;
+ mp_limb_t cy;
+
+ l = n >> 1;
+ h = n - l;
+
+ cy = mpn_dc_bdiv_qr_n (qp, np, dp, l, dinv, tp);
+
+ mpn_mullow_n (tp, qp, dp + h, l);
+ mpn_sub_n (np + h, np + h, tp, l);
+
+ if (l < h)
+ {
+ cy += mpn_submul_1 (np + l, qp, l, dp[l]);
+ np[n - 1] -= cy;
+ }
+ qp += l;
+ np += l;
+ n -= l;
+ }
+ mpn_sb_bdiv_q (qp, np, n, dp, n, dinv);
+}
+
+void
+mpn_dc_bdiv_q (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn,
+ mp_limb_t dinv)
+{
+ mp_size_t qn;
+ mp_limb_t cy;
+ mp_ptr tp;
+ TMP_DECL;
+
+ TMP_MARK;
+
+ tp = TMP_SALLOC_LIMBS (dn);
+
+ qn = nn;
+
+ if (qn > dn)
+ {
+ /* Reduce qn mod dn in a super-efficient manner. */
+ do
+ qn -= dn;
+ while (qn > dn);
+
+ /* Perform the typically smaller block first. */
+ if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
+ cy = mpn_sb_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
+ else
+ cy = mpn_dc_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
+
+ if (qn != dn)
+ {
+ if (qn > dn - qn)
+ mpn_mul (tp, qp, qn, dp + qn, dn - qn);
+ else
+ mpn_mul (tp, dp + qn, dn - qn, qp, qn);
+ mpn_incr_u (tp + qn, cy);
+
+ mpn_sub (np + qn, np + qn, nn - qn, tp, dn);
+ cy = 0;
+ }
+
+ np += qn;
+ qp += qn;
+
+ qn = nn - qn;
+ while (qn > dn)
+ {
+ mpn_sub_1 (np + dn, np + dn, qn, cy);
+ cy = mpn_dc_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
+ qp += dn;
+ np += dn;
+ qn -= dn;
+ }
+ mpn_sub_1 (np + dn, np + dn, qn, cy);
+ mpn_dc_bdiv_q_n (qp, np, dp, dn, dinv, tp);
+ TMP_FREE;
+ return;
+ }
+
+ if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD))
+ mpn_sb_bdiv_q (qp, np, 2 * qn, dp, qn, dinv);
+ else
+ mpn_dc_bdiv_q_n (qp, np, dp, qn, dinv, tp);
+
+ TMP_FREE;
+}
diff --git a/gmp/mpn/generic/dcpi1_bdiv_qr.c b/gmp/mpn/generic/dc_bdiv_qr.c
index 8a251f8d9d..8b59bbd860 100644
--- a/gmp/mpn/generic/dcpi1_bdiv_qr.c
+++ b/gmp/mpn/generic/dc_bdiv_qr.c
@@ -1,39 +1,29 @@
-/* mpn_dcpi1_bdiv_qr -- divide-and-conquer Hensel division with precomputed
+/* mpn_dc_bdiv_qr -- divide-and-conquer Hensel division with precomputed
inverse, returning quotient and remainder.
- Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+ Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
-Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -53,14 +43,14 @@ see https://www.gnu.org/licenses/. */
d must be odd. dinv is (-d)^-1 mod 2^GMP_NUMB_BITS. */
mp_size_t
-mpn_dcpi1_bdiv_qr_n_itch (mp_size_t n)
+mpn_dc_bdiv_qr_n_itch (mp_size_t n)
{
return n;
}
mp_limb_t
-mpn_dcpi1_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
- mp_limb_t dinv, mp_ptr tp)
+mpn_dc_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
+ mp_limb_t dinv, mp_ptr tp)
{
mp_size_t lo, hi;
mp_limb_t cy;
@@ -70,9 +60,9 @@ mpn_dcpi1_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
hi = n - lo; /* ceil(n/2) */
if (BELOW_THRESHOLD (lo, DC_BDIV_QR_THRESHOLD))
- cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * lo, dp, lo, dinv);
+ cy = mpn_sb_bdiv_qr (qp, np, 2 * lo, dp, lo, dinv);
else
- cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp);
+ cy = mpn_dc_bdiv_qr_n (qp, np, dp, lo, dinv, tp);
mpn_mul (tp, dp + lo, hi, qp, lo);
@@ -80,9 +70,9 @@ mpn_dcpi1_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
rh = mpn_sub (np + lo, np + lo, n + hi, tp, n);
if (BELOW_THRESHOLD (hi, DC_BDIV_QR_THRESHOLD))
- cy = mpn_sbpi1_bdiv_qr (qp + lo, np + lo, 2 * hi, dp, hi, dinv);
+ cy = mpn_sb_bdiv_qr (qp + lo, np + lo, 2 * hi, dp, hi, dinv);
else
- cy = mpn_dcpi1_bdiv_qr_n (qp + lo, np + lo, dp, hi, dinv, tp);
+ cy = mpn_dc_bdiv_qr_n (qp + lo, np + lo, dp, hi, dinv, tp);
mpn_mul (tp, qp + lo, hi, dp + hi, lo);
@@ -93,8 +83,8 @@ mpn_dcpi1_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
}
mp_limb_t
-mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+mpn_dc_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn,
+ mp_limb_t dinv)
{
mp_size_t qn;
mp_limb_t rr, cy;
@@ -103,10 +93,6 @@ mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
TMP_MARK;
- ASSERT (dn >= 2); /* to adhere to mpn_sbpi1_div_qr's limits */
- ASSERT (nn - dn >= 1); /* to adhere to mpn_sbpi1_div_qr's limits */
- ASSERT (dp[0] & 1);
-
tp = TMP_SALLOC_LIMBS (dn);
qn = nn - dn;
@@ -120,9 +106,9 @@ mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
/* Perform the typically smaller block first. */
if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
- cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
+ cy = mpn_sb_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
else
- cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
+ cy = mpn_dc_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
rr = 0;
if (qn != dn)
@@ -144,7 +130,7 @@ mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
do
{
rr += mpn_sub_1 (np + dn, np + dn, qn, cy);
- cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
+ cy = mpn_dc_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
qp += dn;
np += dn;
qn -= dn;
@@ -155,9 +141,9 @@ mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
}
if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
- cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
+ cy = mpn_sb_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
else
- cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
+ cy = mpn_dc_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
rr = 0;
if (qn != dn)
diff --git a/gmp/mpn/generic/dc_div_q.c b/gmp/mpn/generic/dc_div_q.c
new file mode 100644
index 0000000000..276ae4fba6
--- /dev/null
+++ b/gmp/mpn/generic/dc_div_q.c
@@ -0,0 +1,57 @@
+/* mpn_dc_div_q -- divide-and-conquer division, returning exact quotient only.
+
+ Contributed to the GNU project by Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
+
+Copyright 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+mp_limb_t
+mpn_dc_div_q (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+{
+ mp_ptr tp, wp;
+ mp_limb_t qh;
+ mp_size_t qn;
+ TMP_DECL;
+
+ TMP_MARK;
+
+ tp = TMP_SALLOC_LIMBS (nn + 1);
+ MPN_COPY (tp + 1, np, nn);
+ tp[0] = 0;
+
+ qn = nn - dn;
+ wp = TMP_SALLOC_LIMBS (qn + 1);
+
+ qh = mpn_dc_divappr_q (wp, tp, nn + 1, dp, dn);
+
+ if (wp[0] == 0)
+ /* FIXME: Should multiply and subtract here, not recompute from scratch. */
+ qh = mpn_dc_div_qr (qp, np, nn, dp, dn);
+ else
+ MPN_COPY (qp, wp + 1, qn);
+
+ return qh;
+}
diff --git a/gmp/mpn/generic/dc_div_qr.c b/gmp/mpn/generic/dc_div_qr.c
new file mode 100644
index 0000000000..41a46f1516
--- /dev/null
+++ b/gmp/mpn/generic/dc_div_qr.c
@@ -0,0 +1,203 @@
+/* mpn_dc_div_qr -- recursive divide-and-conquer division for arbitrary size
+ operands.
+
+ Contributed to the GNU project by Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
+
+Copyright 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+mp_limb_t
+mpn_dc_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
+ mp_srcptr dip, mp_ptr tp)
+{
+ mp_size_t lo, hi;
+ mp_limb_t cy, qh, ql;
+
+ lo = n >> 1; /* floor(n/2) */
+ hi = n - lo; /* ceil(n/2) */
+
+ if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
+ qh = mpn_sb_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dip);
+ else
+ qh = mpn_dc_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dip, tp);
+
+ mpn_mul (tp, qp + lo, hi, dp, lo);
+
+ cy = mpn_sub_n (np + lo, np + lo, tp, n);
+ if (qh != 0)
+ cy += mpn_sub_n (np + n, np + n, dp, lo);
+
+ while (cy != 0)
+ {
+ qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
+ cy -= mpn_add_n (np + lo, np + lo, dp, n);
+ }
+
+ if (BELOW_THRESHOLD (lo, DC_DIV_QR_THRESHOLD))
+ ql = mpn_sb_div_qr (qp, np + hi, 2 * lo, dp + hi, lo, dip);
+ else
+ ql = mpn_dc_div_qr_n (qp, np + hi, dp + hi, lo, dip, tp);
+
+ mpn_mul (tp, dp, hi, qp, lo);
+
+ cy = mpn_sub_n (np, np, tp, n);
+ if (ql != 0)
+ cy += mpn_sub_n (np + lo, np + lo, dp, hi);
+
+ while (cy != 0)
+ {
+ mpn_sub_1 (qp, qp, lo, 1);
+ cy -= mpn_add_n (np, np, dp, n);
+ }
+
+ return qh;
+}
+
+mp_limb_t
+mpn_preinv_dc_div_qr (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn,
+ mp_srcptr dip)
+{
+ mp_size_t qn;
+ mp_limb_t qh, cy;
+ mp_ptr tp;
+ TMP_DECL;
+
+ TMP_MARK;
+
+ tp = TMP_SALLOC_LIMBS (dn);
+
+ qn = nn - dn;
+ qp += qn;
+ np += nn;
+ dp += dn;
+
+ if (qn > dn)
+ {
+ /* Reduce qn mod dn without division, optimizing small operations. */
+ do
+ qn -= dn;
+ while (qn > dn);
+
+ qp -= qn; /* point at low limb of next quotient block */
+ np -= qn; /* point in the middle of partial remainder */
+
+ /* Perform the typically smaller block first. */
+ if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
+ qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dip);
+ else
+ qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dip, tp);
+
+ if (qn != dn)
+ {
+ if (qn > dn - qn)
+ mpn_mul (tp, qp, qn, dp - dn, dn - qn);
+ else
+ mpn_mul (tp, dp - dn, dn - qn, qp, qn);
+
+ cy = mpn_sub_n (np - dn, np - dn, tp, dn);
+ if (qh != 0)
+ cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
+
+ while (cy != 0)
+ {
+ qh -= mpn_sub_1 (qp, qp, qn, 1);
+ cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
+ }
+ }
+
+ qn = nn - dn - qn;
+ do
+ {
+ qp -= dn;
+ np -= dn;
+ mpn_dc_div_qr_n (qp, np - dn, dp - dn, dn, dip, tp);
+ qn -= dn;
+ }
+ while (qn > 0);
+ }
+ else
+ {
+ if (qn == 0)
+ {
+ qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
+ if (qh)
+ mpn_sub_n (np - dn, np - dn, dp - dn, dn);
+ TMP_FREE;
+ return qh;
+ }
+
+ qp -= qn; /* point at low limb of next quotient block */
+ np -= qn; /* point in the middle of partial remainder */
+
+ if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
+ qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dip);
+ else
+ qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dip, tp);
+
+ if (qn != dn)
+ {
+ if (qn > dn - qn)
+ mpn_mul (tp, qp, qn, dp - dn, dn - qn);
+ else
+ mpn_mul (tp, dp - dn, dn - qn, qp, qn);
+
+ cy = mpn_sub_n (np - dn, np - dn, tp, dn);
+ if (qh != 0)
+ cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
+
+ while (cy != 0)
+ {
+ qh -= mpn_sub_1 (qp, qp, qn, 1);
+ cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
+ }
+ }
+ }
+
+ TMP_FREE;
+ return qh;
+}
+
+mp_limb_t
+mpn_dc_div_qr (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+{
+ mp_limb_t cy;
+ mp_limb_t xp[2], dip[2];
+
+ ASSERT (dn >= 2);
+
+ cy = mpn_add_1 (xp, dp + dn - 2, 2, 1);
+ if (cy != 0)
+ dip[0] = dip[1] = 0;
+ else
+ {
+ mp_limb_t scratch[10]; /* FIXME */
+ mpn_invert (dip, xp, 2, scratch);
+ }
+
+ return mpn_preinv_dc_div_qr (qp, np, nn, dp, dn, dip);
+}
diff --git a/gmp/mpn/generic/dc_divappr_q.c b/gmp/mpn/generic/dc_divappr_q.c
new file mode 100644
index 0000000000..4474872388
--- /dev/null
+++ b/gmp/mpn/generic/dc_divappr_q.c
@@ -0,0 +1,196 @@
+/* mpn_dc_divappr_q -- divide-and-conquer division, returning only approximate
+ quotient. The quotient returned is either correct, or unity too large.
+
+ Contributed to the GNU project by Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
+
+Copyright 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+mp_limb_t
+mpn_dc_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
+ mp_srcptr dip, mp_ptr tp)
+{
+ mp_size_t lo, hi;
+ mp_limb_t cy, qh, ql;
+
+ lo = n >> 1; /* floor(n/2) */
+ hi = n - lo; /* ceil(n/2) */
+
+ if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
+ qh = mpn_sb_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dip);
+ else
+ qh = mpn_dc_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dip, tp);
+
+ mpn_mul (tp, qp + lo, hi, dp, lo);
+
+ cy = mpn_sub_n (np + lo, np + lo, tp, n);
+ if (qh != 0)
+ cy += mpn_sub_n (np + n, np + n, dp, lo);
+
+ while (cy != 0)
+ {
+ qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
+ cy -= mpn_add_n (np + lo, np + lo, dp, n);
+ }
+
+ if (BELOW_THRESHOLD (lo, DC_DIVAPPR_Q_THRESHOLD))
+ ql = mpn_sb_divappr_q (qp, np + hi, 2 * lo, dp + hi, lo, dip);
+ else
+ ql = mpn_dc_divappr_q_n (qp, np + hi, dp + hi, lo, dip, tp);
+
+ if (UNLIKELY (ql != 0))
+ {
+ mp_size_t i;
+ for (i = 0; i < lo; i++)
+ qp[i] = GMP_NUMB_MASK;
+ }
+
+ return qh;
+}
+
+mp_limb_t
+mpn_preinv_dc_divappr_q (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn,
+ mp_srcptr dip)
+{
+ mp_size_t qn;
+ mp_limb_t qh, cy, qsave;
+ mp_ptr tp;
+ TMP_DECL;
+
+ TMP_MARK;
+
+ tp = TMP_SALLOC_LIMBS (dn+1);
+
+ qn = nn - dn;
+ qp += qn;
+ np += nn;
+ dp += dn;
+
+ if (qn > dn)
+ {
+ qn++; /* pretend we'll need an extra limb */
+ /* Reduce qn mod dn without division, optimizing small operations. */
+ do
+ qn -= dn;
+ while (qn > dn);
+
+ qp -= qn; /* point at low limb of next quotient block */
+ np -= qn; /* point in the middle of partial remainder */
+
+ /* Perform the typically smaller block first. */
+ if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
+ qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dip);
+ else
+ qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dip, tp);
+
+ if (qn != dn)
+ {
+ if (qn > dn - qn)
+ mpn_mul (tp, qp, qn, dp - dn, dn - qn);
+ else
+ mpn_mul (tp, dp - dn, dn - qn, qp, qn);
+
+ cy = mpn_sub_n (np - dn, np - dn, tp, dn);
+ if (qh != 0)
+ cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
+
+ while (cy != 0)
+ {
+ qh -= mpn_sub_1 (qp, qp, qn, 1);
+ cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
+ }
+ }
+
+ qn = nn - dn - qn + 1;
+ while (qn > dn)
+ {
+ qp -= dn;
+ np -= dn;
+ mpn_dc_div_qr_n (qp, np - dn, dp - dn, dn, dip, tp);
+ qn -= dn;
+ }
+
+ /* Since we pretended we'd need an extra quotient limb before, we now
+ have made sure the code above left just dn-1=qn quotient limbs to
+ develop. Develop that plus a guard limb. */
+ qn--;
+ qp -= qn;
+ np -= dn;
+ qsave = qp[qn];
+ mpn_dc_divappr_q_n (qp, np - dn, dp - dn, dn, dip, tp);
+ MPN_COPY_INCR (qp, qp + 1, qn);
+ qp[qn] = qsave;
+ }
+ else
+ {
+ if (qn == 0)
+ {
+ qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
+ if (qh)
+ mpn_sub_n (np - dn, np - dn, dp - dn, dn);
+ TMP_FREE;
+ return qh;
+ }
+
+ qp -= qn; /* point at low limb of next quotient block */
+ np -= qn; /* point in the middle of partial remainder */
+
+ if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
+ /* Full precision. Optimal? */
+ qh = mpn_sb_divappr_q (qp, np - dn, nn, dp - dn, dn, dip);
+ else
+ {
+ /* Put quotient in tp, use qp as temporary, since qp lacks a limb. */
+ qh = mpn_dc_divappr_q_n (tp, np - qn - 2, dp - (qn + 1), qn + 1, dip, qp);
+ MPN_COPY (qp, tp + 1, qn);
+ }
+ }
+
+ TMP_FREE;
+ return qh;
+}
+
+mp_limb_t
+mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+{
+ mp_limb_t cy;
+ mp_limb_t xp[2], dip[2];
+
+ ASSERT (dn >= 2);
+
+ cy = mpn_add_1 (xp, dp + dn - 2, 2, 1);
+ if (cy != 0)
+ dip[0] = dip[1] = 0;
+ else
+ {
+ mp_limb_t scratch[10]; /* FIXME */
+ mpn_invert (dip, xp, 2, scratch);
+ }
+
+ return mpn_preinv_dc_divappr_q (qp, np, nn, dp, dn, dip);
+}
diff --git a/gmp/mpn/generic/dc_divrem_n.c b/gmp/mpn/generic/dc_divrem_n.c
new file mode 100644
index 0000000000..61ddde72c3
--- /dev/null
+++ b/gmp/mpn/generic/dc_divrem_n.c
@@ -0,0 +1,121 @@
+/* mpn_dc_divrem_n and auxiliary routines.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright 2000, 2001, 2002, 2004, 2005 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/*
+[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
+ Technical report MPI-I-98-1-022, October 1998.
+ http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz
+*/
+
+static mp_limb_t mpn_dc_div_3_by_2
+ __GMP_PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_ptr scratch));
+static mp_limb_t mpn_dc_div_2_by_1
+ __GMP_PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_ptr scratch));
+
+/* mpn_dc_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n)
+ by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n).
+ Returns most significant limb of the quotient, which is 0 or 1.
+ Requires that the most significant bit of the divisor is set. */
+
+mp_limb_t
+mpn_dc_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+{
+ mp_limb_t ret;
+ mp_ptr scratch;
+ TMP_DECL;
+ TMP_MARK;
+
+ scratch = TMP_ALLOC_LIMBS (n);
+ ret = mpn_dc_div_2_by_1 (qp, np, dp, n, scratch);
+
+ TMP_FREE;
+ return ret;
+}
+
+static mp_limb_t
+mpn_dc_div_2_by_1 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
+{
+ mp_limb_t qhl, cc;
+ mp_size_t n2 = n/2;
+
+ if (n % 2 != 0)
+ {
+ mp_ptr qp1 = qp + 1;
+ qhl = mpn_dc_div_3_by_2 (qp1 + n2, np + 2 + n2, dp + 1, n2, scratch);
+ qhl += mpn_add_1 (qp1 + n2, qp1 + n2, n2,
+ mpn_dc_div_3_by_2 (qp1, np + 2, dp + 1, n2, scratch));
+
+ cc = mpn_submul_1 (np + 1, qp1, n - 1, dp[0]);
+ cc = mpn_sub_1 (np + n, np + n, 1, cc);
+ if (qhl != 0)
+ cc += mpn_sub_1 (np + n, np + n, 1, dp[0]);
+ while (cc != 0)
+ {
+ qhl -= mpn_sub_1 (qp1, qp1, n - 1, (mp_limb_t) 1);
+ cc -= mpn_add_n (np + 1, np + 1, dp, n);
+ }
+ qhl += mpn_add_1 (qp1, qp1, n - 1,
+ mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
+ }
+ else
+ {
+ qhl = mpn_dc_div_3_by_2 (qp + n2, np + n2, dp, n2, scratch);
+ qhl += mpn_add_1 (qp + n2, qp + n2, n2,
+ mpn_dc_div_3_by_2 (qp, np, dp, n2, scratch));
+ }
+ return qhl;
+}
+
+
+/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n),
+ the remainder in (np, 2n) */
+
+static mp_limb_t
+mpn_dc_div_3_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
+{
+ mp_size_t twon = n + n;
+ mp_limb_t qhl, cc;
+
+ if (n < DIV_DC_THRESHOLD)
+ qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
+ else
+ qhl = mpn_dc_div_2_by_1 (qp, np + n, dp + n, n, scratch);
+
+ mpn_mul_n (scratch, qp, dp, n);
+ cc = mpn_sub_n (np, np, scratch, twon);
+
+ if (qhl != 0)
+ cc += mpn_sub_n (np + n, np + n, dp, n);
+ while (cc != 0)
+ {
+ qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1);
+ cc -= mpn_add_n (np, np, dp, twon);
+ }
+ return qhl;
+}
diff --git a/gmp/mpn/generic/dcpi1_bdiv_q.c b/gmp/mpn/generic/dcpi1_bdiv_q.c
deleted file mode 100644
index a7b86c96d4..0000000000
--- a/gmp/mpn/generic/dcpi1_bdiv_q.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/* mpn_dcpi1_bdiv_q -- divide-and-conquer Hensel division with precomputed
- inverse, returning quotient.
-
- Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2006, 2007, 2009-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Returns n.
- NOTE(review): presumably this is the number of scratch limbs to supply as
- tp for an n-limb mpn_dcpi1_bdiv_q_n call -- confirm against the callers. */
-mp_size_t
-mpn_dcpi1_bdiv_q_n_itch (mp_size_t n)
-{
- /* NOTE: Depends on mullo_n interface */
- return n;
-}
-
-/* Computes Q = N / D mod B^n, destroys N.
-
- N = {np,n}
- D = {dp,n}
-
- Q is written at qp. While n is above DC_BDIV_Q_THRESHOLD each pass
- develops the low floor(n/2) quotient limbs with mpn_dcpi1_bdiv_qr_n,
- updates the remaining part of N, and continues on the remaining
- ceil(n/2) limbs; the final piece is handled by mpn_sbpi1_bdiv_q.
- dinv is passed unchanged to those helpers and tp is scratch space.
-*/
-
-void
-mpn_dcpi1_bdiv_q_n (mp_ptr qp,
- mp_ptr np, mp_srcptr dp, mp_size_t n,
- mp_limb_t dinv, mp_ptr tp)
-{
- while (ABOVE_THRESHOLD (n, DC_BDIV_Q_THRESHOLD))
- {
- mp_size_t lo, hi;
- mp_limb_t cy;
-
- lo = n >> 1; /* floor(n/2) */
- hi = n - lo; /* ceil(n/2) */
-
- cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp); /* low lo quotient limbs */
-
- mpn_mullo_n (tp, qp, dp + hi, lo); /* low lo limbs of quotient block times {dp + hi, lo} */
- mpn_sub_n (np + hi, np + hi, tp, lo);
-
- if (lo < hi) /* odd n: the middle divisor limb dp[lo] still has to be applied */
- {
- cy += mpn_submul_1 (np + lo, qp, lo, dp[lo]);
- np[n - 1] -= cy;
- }
- qp += lo; /* step past the limbs just produced */
- np += lo;
- n -= lo;
- }
- mpn_sbpi1_bdiv_q (qp, np, n, dp, n, dinv); /* n is no longer above the threshold here */
-}
-
-/* Computes Q = N / D mod B^nn, destroys N.
-
- N = {np,nn}
- D = {dp,dn}
-
- Q is written at qp; nn quotient limbs are developed. dinv is passed
- unchanged to the mpn_sbpi1_bdiv_* and mpn_dcpi1_bdiv_* helpers. The
- assertions require dn >= 2, nn >= dn and an odd low divisor limb. A
- dn-limb temporary is allocated internally.
-*/
-
-void
-mpn_dcpi1_bdiv_q (mp_ptr qp,
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_limb_t dinv)
-{
- mp_size_t qn;
- mp_limb_t cy;
- mp_ptr tp;
- TMP_DECL;
-
- TMP_MARK;
-
- ASSERT (dn >= 2);
- ASSERT (nn - dn >= 0);
- ASSERT (dp[0] & 1);
-
- tp = TMP_SALLOC_LIMBS (dn);
-
- qn = nn; /* one quotient limb per dividend limb */
-
- if (qn > dn)
- {
- /* Reduce qn mod dn in a super-efficient manner. */
- do
- qn -= dn;
- while (qn > dn);
-
- /* Perform the typically smaller block first. */
- if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
- cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
- else
- cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
-
- if (qn != dn)
- {
- if (qn > dn - qn) /* mpn_mul is called with the longer operand first */
- mpn_mul (tp, qp, qn, dp + qn, dn - qn);
- else
- mpn_mul (tp, dp + qn, dn - qn, qp, qn);
- mpn_incr_u (tp + qn, cy); /* fold in the carry returned by the block division */
-
- mpn_sub (np + qn, np + qn, nn - qn, tp, dn);
- cy = 0;
- }
-
- np += qn;
- qp += qn;
-
- qn = nn - qn; /* quotient limbs still to be developed */
- while (qn > dn) /* full dn-limb blocks */
- {
- mpn_sub_1 (np + dn, np + dn, qn - dn, cy);
- cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
- qp += dn;
- np += dn;
- qn -= dn;
- }
- mpn_dcpi1_bdiv_q_n (qp, np, dp, dn, dinv, tp); /* last block: quotient only */
- }
- else /* qn <= dn, i.e. nn == dn given the assertion above */
- {
- if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD))
- mpn_sbpi1_bdiv_q (qp, np, qn, dp, qn, dinv);
- else
- mpn_dcpi1_bdiv_q_n (qp, np, dp, qn, dinv, tp);
- }
-
- TMP_FREE;
-}
diff --git a/gmp/mpn/generic/dcpi1_div_q.c b/gmp/mpn/generic/dcpi1_div_q.c
deleted file mode 100644
index 32d74c31a9..0000000000
--- a/gmp/mpn/generic/dcpi1_div_q.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/* mpn_dc_div_q -- divide-and-conquer division, returning exact quotient
- only.
-
- Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Computes the quotient of {np,nn} by {dp,dn}, storing nn - dn limbs at qp
- and returning the high quotient limb qh. np is only read. An
- approximate quotient with one extra low limb is obtained from
- mpn_dcpi1_divappr_q, run on a temporary copy of N extended by a zero low
- limb. When that extra limb is zero the candidate is checked by
- multiplying it back, and it is decremented if the product exceeds N.
- The assertions require dn >= 6, nn - dn >= 3 and the high bit of
- dp[dn-1] set. */
-mp_limb_t
-mpn_dcpi1_div_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
-{
- mp_ptr tp, wp;
- mp_limb_t qh;
- mp_size_t qn;
- TMP_DECL;
-
- TMP_MARK;
-
- ASSERT (dn >= 6);
- ASSERT (nn - dn >= 3);
- ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
-
- tp = TMP_SALLOC_LIMBS (nn + 1);
- MPN_COPY (tp + 1, np, nn); /* working copy of N, shifted up one limb */
- tp[0] = 0;
-
- qn = nn - dn;
- wp = TMP_SALLOC_LIMBS (qn + 1); /* quotient plus one extra low limb */
-
- qh = mpn_dcpi1_divappr_q (wp, tp, nn + 1, dp, dn, dinv);
-
- if (wp[0] == 0) /* extra low limb is zero: verify the candidate quotient */
- {
- mp_limb_t cy;
-
- if (qn > dn) /* mpn_mul is called with the longer operand first */
- mpn_mul (tp, wp + 1, qn, dp, dn);
- else
- mpn_mul (tp, dp, dn, wp + 1, qn);
-
- cy = (qh != 0) ? mpn_add_n (tp + qn, tp + qn, dp, dn) : 0; /* include the qh term of the product */
-
- if (cy || mpn_cmp (tp, np, nn) > 0) /* At most is wrong by one, no cycle. */
- qh -= mpn_sub_1 (qp, wp + 1, qn, 1);
- else /* Same as below */
- MPN_COPY (qp, wp + 1, qn);
- }
- else
- MPN_COPY (qp, wp + 1, qn);
-
- TMP_FREE;
- return qh;
-}
diff --git a/gmp/mpn/generic/dcpi1_div_qr.c b/gmp/mpn/generic/dcpi1_div_qr.c
deleted file mode 100644
index 4d80c7b769..0000000000
--- a/gmp/mpn/generic/dcpi1_div_qr.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/* mpn_dcpi1_div_qr_n -- recursive divide-and-conquer division for arbitrary
- size operands.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2006, 2007, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Recursive divide-and-conquer division of the 2n limbs at np by the n-limb
- divisor at dp. The n quotient limbs are stored at qp, np is updated in
- place, and the high quotient limb qh is returned. The upper ceil(n/2)
- quotient limbs are developed first, then the lower floor(n/2); each half
- uses mpn_sbpi1_div_qr below DC_DIV_QR_THRESHOLD and recursion otherwise.
- tp is scratch for the n-limb partial products. */
-mp_limb_t
-mpn_dcpi1_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
- gmp_pi1_t *dinv, mp_ptr tp)
-{
- mp_size_t lo, hi;
- mp_limb_t cy, qh, ql;
-
- lo = n >> 1; /* floor(n/2) */
- hi = n - lo; /* ceil(n/2) */
-
- if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
- qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32);
- else
- qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp);
-
- mpn_mul (tp, qp + lo, hi, dp, lo); /* upper quotient half times the low lo divisor limbs */
-
- cy = mpn_sub_n (np + lo, np + lo, tp, n);
- if (qh != 0)
- cy += mpn_sub_n (np + n, np + n, dp, lo);
-
- while (cy != 0) /* borrow outstanding: decrement the quotient, add the divisor back */
- {
- qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
- cy -= mpn_add_n (np + lo, np + lo, dp, n);
- }
-
- if (BELOW_THRESHOLD (lo, DC_DIV_QR_THRESHOLD))
- ql = mpn_sbpi1_div_qr (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32);
- else
- ql = mpn_dcpi1_div_qr_n (qp, np + hi, dp + hi, lo, dinv, tp);
-
- mpn_mul (tp, dp, hi, qp, lo); /* lower quotient half times the low hi divisor limbs */
-
- cy = mpn_sub_n (np, np, tp, n);
- if (ql != 0)
- cy += mpn_sub_n (np + lo, np + lo, dp, hi);
-
- while (cy != 0) /* same adjustment for the lower half; its borrow is not propagated into qh */
- {
- mpn_sub_1 (qp, qp, lo, 1);
- cy -= mpn_add_n (np, np, dp, n);
- }
-
- return qh;
-}
-/* Divides {np,nn} by {dp,dn}. The nn - dn quotient limbs are stored at qp,
- np is updated in place, and the high quotient limb qh is returned. The
- quotient is developed from the most significant end in blocks of at most
- dn limbs, the first block being the odd-sized one. dinv->inv32 is handed
- to udiv_qr_3by2 and mpn_sbpi1_div_qr; dinv itself to mpn_dcpi1_div_qr_n.
- A dn-limb temporary is allocated internally. */
-mp_limb_t
-mpn_dcpi1_div_qr (mp_ptr qp,
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- gmp_pi1_t *dinv)
-{
- mp_size_t qn;
- mp_limb_t qh, cy;
- mp_ptr tp;
- TMP_DECL;
-
- TMP_MARK;
-
- ASSERT (dn >= 6); /* to adhere to mpn_sbpi1_div_qr's limits */
- ASSERT (nn - dn >= 3); /* to adhere to mpn_sbpi1_div_qr's limits */
- ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
-
- tp = TMP_SALLOC_LIMBS (dn);
-
- qn = nn - dn;
- qp += qn; /* from here on qp, np and dp point just past their operands */
- np += nn;
- dp += dn;
-
- if (qn > dn)
- {
- /* Reduce qn mod dn without division, optimizing small operations. */
- do
- qn -= dn;
- while (qn > dn);
-
- qp -= qn; /* point at low limb of next quotient block */
- np -= qn; /* point in the middle of partial remainder */
-
- /* Perform the typically smaller block first. */
- if (qn == 1)
- {
- mp_limb_t q, n2, n1, n0, d1, d0;
-
- /* Handle qh up front, for simplicity. */
- qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
- if (qh)
- ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));
-
- /* A single iteration of schoolbook: One 3/2 division,
- followed by the bignum update and adjustment. */
- n2 = np[0];
- n1 = np[-1];
- n0 = np[-2];
- d1 = dp[-1];
- d0 = dp[-2];
-
- ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));
-
- if (UNLIKELY (n2 == d1) && n1 == d0) /* top limbs equal: quotient limb is all ones */
- {
- q = GMP_NUMB_MASK;
- cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
- ASSERT (cy == n2);
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);
-
- if (dn > 2)
- {
- mp_limb_t cy, cy1; /* note: this cy shadows the function-level cy */
- cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);
-
- cy1 = n0 < cy;
- n0 = (n0 - cy) & GMP_NUMB_MASK;
- cy = n1 < cy1;
- n1 = (n1 - cy1) & GMP_NUMB_MASK;
- np[-2] = n0;
-
- if (UNLIKELY (cy != 0)) /* borrow out: q was one too large */
- {
- n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
- qh -= (q == 0);
- q = (q - 1) & GMP_NUMB_MASK;
- }
- }
- else
- np[-2] = n0;
-
- np[-1] = n1;
- }
- qp[0] = q;
- }
- else
- {
- /* Do a 2qn / qn division */
- if (qn == 2)
- qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); /* FIXME: obsolete function. Use 5/3 division? */
- else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
- qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
- else
- qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);
-
- if (qn != dn)
- {
- if (qn > dn - qn) /* mpn_mul is called with the longer operand first */
- mpn_mul (tp, qp, qn, dp - dn, dn - qn);
- else
- mpn_mul (tp, dp - dn, dn - qn, qp, qn);
-
- cy = mpn_sub_n (np - dn, np - dn, tp, dn);
- if (qh != 0)
- cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
-
- while (cy != 0) /* borrow outstanding: decrement the quotient, add the divisor back */
- {
- qh -= mpn_sub_1 (qp, qp, qn, 1);
- cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
- }
- }
- }
-
- qn = nn - dn - qn; /* remaining quotient limbs, consumed dn at a time below */
- do
- {
- qp -= dn;
- np -= dn;
- mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
- qn -= dn;
- }
- while (qn > 0);
- }
- else
- {
- qp -= qn; /* point at low limb of next quotient block */
- np -= qn; /* point in the middle of partial remainder */
-
- if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
- qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
- else
- qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);
-
- if (qn != dn)
- {
- if (qn > dn - qn) /* mpn_mul is called with the longer operand first */
- mpn_mul (tp, qp, qn, dp - dn, dn - qn);
- else
- mpn_mul (tp, dp - dn, dn - qn, qp, qn);
-
- cy = mpn_sub_n (np - dn, np - dn, tp, dn);
- if (qh != 0)
- cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
-
- while (cy != 0) /* borrow outstanding: decrement the quotient, add the divisor back */
- {
- qh -= mpn_sub_1 (qp, qp, qn, 1);
- cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
- }
- }
- }
-
- TMP_FREE;
- return qh;
-}
diff --git a/gmp/mpn/generic/dcpi1_divappr_q.c b/gmp/mpn/generic/dcpi1_divappr_q.c
deleted file mode 100644
index c7b03c7f49..0000000000
--- a/gmp/mpn/generic/dcpi1_divappr_q.c
+++ /dev/null
@@ -1,257 +0,0 @@
-/* mpn_dcpi1_divappr_q -- divide-and-conquer division, returning approximate
- quotient. The quotient returned is either correct, or one too large.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Approximate-quotient variant of the recursive n-limb division step. The
- upper ceil(n/2) quotient limbs are developed as a full quotient-and-
- remainder division (mpn_sbpi1_div_qr or mpn_dcpi1_div_qr_n) and adjusted;
- the lower floor(n/2) limbs come from an approximate-quotient call
- (mpn_sbpi1_divappr_q or recursion) with no remainder update afterwards.
- Quotient limbs are stored at qp, np is overwritten, tp is scratch, and
- the high quotient limb qh is returned. */
-mp_limb_t
-mpn_dcpi1_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
- gmp_pi1_t *dinv, mp_ptr tp)
-{
- mp_size_t lo, hi;
- mp_limb_t cy, qh, ql;
-
- lo = n >> 1; /* floor(n/2) */
- hi = n - lo; /* ceil(n/2) */
-
- if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
- qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32);
- else
- qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp);
-
- mpn_mul (tp, qp + lo, hi, dp, lo); /* upper quotient half times the low lo divisor limbs */
-
- cy = mpn_sub_n (np + lo, np + lo, tp, n);
- if (qh != 0)
- cy += mpn_sub_n (np + n, np + n, dp, lo);
-
- while (cy != 0) /* borrow outstanding: decrement the quotient, add the divisor back */
- {
- qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
- cy -= mpn_add_n (np + lo, np + lo, dp, n);
- }
-
- if (BELOW_THRESHOLD (lo, DC_DIVAPPR_Q_THRESHOLD))
- ql = mpn_sbpi1_divappr_q (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32);
- else
- ql = mpn_dcpi1_divappr_q_n (qp, np + hi, dp + hi, lo, dinv, tp);
-
- if (UNLIKELY (ql != 0)) /* low block reported a high limb: store all ones in its lo limbs instead */
- {
- mp_size_t i;
- for (i = 0; i < lo; i++)
- qp[i] = GMP_NUMB_MASK;
- }
-
- return qh;
-}
-/* Divide-and-conquer division of {np,nn} by {dp,dn} returning an
- approximate quotient: nn - dn limbs are stored at qp and the high
- quotient limb qh is returned. np is overwritten along the way. The
- assertions require dn >= 6, nn > dn and the high bit of dp[dn-1] set.
- Temporaries are allocated internally. */
-mp_limb_t
-mpn_dcpi1_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
-{
- mp_size_t qn;
- mp_limb_t qh, cy, qsave;
- mp_ptr tp;
- TMP_DECL;
-
- TMP_MARK;
-
- ASSERT (dn >= 6);
- ASSERT (nn > dn);
- ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
-
- qn = nn - dn;
- qp += qn; /* from here on qp, np and dp point just past their operands */
- np += nn;
- dp += dn;
-
- if (qn >= dn)
- {
- qn++; /* pretend we'll need an extra limb */
- /* Reduce qn mod dn without division, optimizing small operations. */
- do
- qn -= dn;
- while (qn > dn);
-
- qp -= qn; /* point at low limb of next quotient block */
- np -= qn; /* point in the middle of partial remainder */
-
- tp = TMP_SALLOC_LIMBS (dn);
-
- /* Perform the typically smaller block first. */
- if (qn == 1)
- {
- mp_limb_t q, n2, n1, n0, d1, d0;
-
- /* Handle qh up front, for simplicity. */
- qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
- if (qh)
- ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));
-
- /* A single iteration of schoolbook: One 3/2 division,
- followed by the bignum update and adjustment. */
- n2 = np[0];
- n1 = np[-1];
- n0 = np[-2];
- d1 = dp[-1];
- d0 = dp[-2];
-
- ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));
-
- if (UNLIKELY (n2 == d1) && n1 == d0) /* top limbs equal: quotient limb is all ones */
- {
- q = GMP_NUMB_MASK;
- cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
- ASSERT (cy == n2);
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);
-
- if (dn > 2)
- {
- mp_limb_t cy, cy1; /* note: this cy shadows the function-level cy */
- cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);
-
- cy1 = n0 < cy;
- n0 = (n0 - cy) & GMP_NUMB_MASK;
- cy = n1 < cy1;
- n1 = (n1 - cy1) & GMP_NUMB_MASK;
- np[-2] = n0;
-
- if (UNLIKELY (cy != 0)) /* borrow out: q was one too large */
- {
- n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
- qh -= (q == 0);
- q = (q - 1) & GMP_NUMB_MASK;
- }
- }
- else
- np[-2] = n0;
-
- np[-1] = n1;
- }
- qp[0] = q;
- }
- else
- {
- if (qn == 2)
- qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2);
- else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
- qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
- else
- qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);
-
- if (qn != dn)
- {
- if (qn > dn - qn) /* mpn_mul is called with the longer operand first */
- mpn_mul (tp, qp, qn, dp - dn, dn - qn);
- else
- mpn_mul (tp, dp - dn, dn - qn, qp, qn);
-
- cy = mpn_sub_n (np - dn, np - dn, tp, dn);
- if (qh != 0)
- cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
-
- while (cy != 0) /* borrow outstanding: decrement the quotient, add the divisor back */
- {
- qh -= mpn_sub_1 (qp, qp, qn, 1);
- cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
- }
- }
- }
- qn = nn - dn - qn + 1; /* remaining limbs, still counting the pretended extra one */
- while (qn > dn) /* full dn-limb blocks with remainder update */
- {
- qp -= dn;
- np -= dn;
- mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
- qn -= dn;
- }
-
- /* Since we pretended we'd need an extra quotient limb before, we now
- have made sure the code above left just dn-1=qn quotient limbs to
- develop. Develop that plus a guard limb. */
- qn--;
- qp -= qn;
- np -= dn;
- qsave = qp[qn]; /* the dn-limb result below overwrites qp[qn]; keep the limb already stored there */
- mpn_dcpi1_divappr_q_n (qp, np - dn, dp - dn, dn, dinv, tp);
- MPN_COPY_INCR (qp, qp + 1, qn); /* shift down by one limb, dropping the guard limb */
- qp[qn] = qsave;
- }
- else /* (qn < dn) */
- {
- mp_ptr q2p;
-#if 0 /* not possible since we demand nn > dn */
- if (qn == 0)
- {
- qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
- if (qh)
- mpn_sub_n (np - dn, np - dn, dp - dn, dn);
- TMP_FREE;
- return qh;
- }
-#endif
-
- qp -= qn; /* point at low limb of next quotient block */
- np -= qn; /* point in the middle of partial remainder */
-
- q2p = TMP_SALLOC_LIMBS (qn + 1); /* quotient plus one low guard limb */
- /* Should we at all check DC_DIVAPPR_Q_THRESHOLD here, or rely on
- callers not to be silly? */
- if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
- {
- qh = mpn_sbpi1_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
- dp - (qn + 1), qn + 1, dinv->inv32);
- }
- else
- {
- /* It is tempting to use qp for recursive scratch and put quotient in
- tp, but the recursive scratch needs one limb too many. */
- tp = TMP_SALLOC_LIMBS (qn + 1);
- qh = mpn_dcpi1_divappr_q_n (q2p, np - qn - 2, dp - (qn + 1), qn + 1, dinv, tp);
- }
- MPN_COPY (qp, q2p + 1, qn); /* drop the guard limb q2p[0] */
- }
-
- TMP_FREE;
- return qh;
-}
diff --git a/gmp/mpn/generic/div_q.c b/gmp/mpn/generic/div_q.c
deleted file mode 100644
index aabcef0825..0000000000
--- a/gmp/mpn/generic/div_q.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/* mpn_div_q -- division for arbitrary size operands.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2009, 2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-
-/* Compute Q = N/D with truncation.
- N = {np,nn}
- D = {dp,dn}
- Q = {qp,nn-dn+1}
- T = {scratch,nn+1} is scratch space
- N and D are both untouched by the computation.
- N and T may overlap; pass the same space if N is irrelevant after the call,
- but note that T needs an extra limb (nn+1 limbs in total).
-
- Operand requirements:
- N >= D > 0
- dp[dn-1] != 0
- No overlap between the N, D, and Q areas.
-
- This division function does not clobber its input operands, since it is
- intended to support average-O(qn) division, and for that to be effective, it
- cannot put requirements on callers to copy a O(nn) operand.
-
- If a caller does not care about the value of {np,nn+1} after calling this
- function, it should pass np also for the scratch argument. This function
- will then save some time and space by avoiding allocation and copying.
- (FIXME: Is this a good design? We only really save any copying for
- already-normalised divisors, which should be rare. It also prevents us from
- reasonably asking for all scratch space we need.)
-
- We write nn-dn+1 limbs for the quotient, but return void. Why not return
- the most significant quotient limb? Look at the 4 main code blocks below
- (consisting of an outer if-else where each arm contains an if-else). It is
- tricky for the first code block, since the mpn_*_div_q calls will typically
- generate all nn-dn+1 and return 0 or 1. I don't see how to fix that unless
- we generate the most significant quotient limb here, before calling
- mpn_*_div_q, or put the quotient in a temporary area. Since this is a
- critical division case (the SB sub-case in particular) copying is not a good
- idea.
-
- It might make sense to split the if-else parts of the (qn + FUDGE
- >= dn) blocks into separate functions, since we could promise quite
- different things to callers in these two cases. The 'then' case
- benefits from np=scratch, and it could perhaps even tolerate qp=np,
- saving some headache for many callers.
-
- FIXME: Scratch allocation leaves a lot to be desired. E.g., for the MU size
- operands, we do not reuse the huge scratch for adjustments. This can be a
- serious waste of memory for the largest operands.
-*/
-
-/* FUDGE determines when to try getting an approximate quotient from the upper
- parts of the dividend and divisor, then adjust. N.B. FUDGE must be >= 2
- for the code to be correct. */
-#define FUDGE 5 /* FIXME: tune this */
-
-#define DC_DIV_Q_THRESHOLD DC_DIVAPPR_Q_THRESHOLD
-#define MU_DIV_Q_THRESHOLD MU_DIVAPPR_Q_THRESHOLD
-#define MUPI_DIV_Q_THRESHOLD MUPI_DIVAPPR_Q_THRESHOLD
-#ifndef MUPI_DIVAPPR_Q_THRESHOLD
-#define MUPI_DIVAPPR_Q_THRESHOLD MUPI_DIV_QR_THRESHOLD
-#endif
-/* Writes the truncated quotient of {np,nn} by {dp,dn} at qp (nn-dn+1 limbs)
- and returns nothing. Two strategies are used: when qn + FUDGE >= dn the
- full operands are divided; otherwise an approximate quotient is computed
- from the high 2*qn+1 dividend limbs and high qn+1 divisor limbs and then
- checked against the full operands. In both cases the divisor is shifted
- left first if its top bit is clear, and the division routine is picked by
- operand size. */
-void
-mpn_div_q (mp_ptr qp,
- mp_srcptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
-{
- mp_ptr new_dp, new_np, tp, rp;
- mp_limb_t cy, dh, qh;
- mp_size_t new_nn, qn;
- gmp_pi1_t dinv;
- int cnt;
- TMP_DECL;
- TMP_MARK;
-
- ASSERT (nn >= dn);
- ASSERT (dn > 0);
- ASSERT (dp[dn - 1] != 0);
- ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
- ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));
- ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn));
-
- ASSERT_ALWAYS (FUDGE >= 2);
-
- if (dn == 1) /* single-limb divisor: hand off to mpn_divrem_1 */
- {
- mpn_divrem_1 (qp, 0L, np, nn, dp[dn - 1]);
- return;
- }
-
- qn = nn - dn + 1; /* Quotient size, high limb might be zero */
-
- if (qn + FUDGE >= dn)
- {
- /* |________________________|
- |_______| */
- new_np = scratch; /* the working dividend lives in the caller's scratch */
-
- dh = dp[dn - 1];
- if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) /* top bit clear: shift both operands left by cnt */
- {
- count_leading_zeros (cnt, dh);
-
- cy = mpn_lshift (new_np, np, nn, cnt);
- new_np[nn] = cy;
- new_nn = nn + (cy != 0); /* one limb longer if bits were shifted out */
-
- new_dp = TMP_ALLOC_LIMBS (dn);
- mpn_lshift (new_dp, dp, dn, cnt);
-
- if (dn == 2)
- {
- qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp);
- }
- else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
- BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD))
- {
- invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
- qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32);
- }
- else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */
- BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
- (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
- + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */
- {
- invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
- qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv);
- }
- else
- {
- mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0);
- mp_ptr scratch = TMP_ALLOC_LIMBS (itch); /* shadows the scratch parameter within this block */
- qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch);
- }
- if (cy == 0) /* dividend did not grow: qh is the top quotient limb */
- qp[qn - 1] = qh;
- else if (UNLIKELY (qh != 0))
- {
- /* This happens only when the quotient is close to B^n and
- mpn_*_divappr_q returned B^n. */
- mp_size_t i, n;
- n = new_nn - dn;
- for (i = 0; i < n; i++)
- qp[i] = GMP_NUMB_MAX;
- qh = 0; /* currently ignored */
- }
- }
- else /* divisor is already normalised */
- {
- if (new_np != np) /* copy only if the caller did not pass np as scratch */
- MPN_COPY (new_np, np, nn);
-
- if (dn == 2)
- {
- qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp);
- }
- else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
- BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD))
- {
- invert_pi1 (dinv, dh, dp[dn - 2]);
- qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32);
- }
- else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */
- BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
- (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
- + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */
- {
- invert_pi1 (dinv, dh, dp[dn - 2]);
- qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv);
- }
- else
- {
- mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0);
- mp_ptr scratch = TMP_ALLOC_LIMBS (itch); /* shadows the scratch parameter within this block */
- qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch); /* passes np itself, not the copy */
- }
- qp[nn - dn] = qh;
- }
- }
- else
- {
- /* |________________________|
- |_________________| */
- tp = TMP_ALLOC_LIMBS (qn + 1); /* approximate quotient: qn limbs plus a low guard limb */
-
- new_np = scratch;
- new_nn = 2 * qn + 1; /* only the high 2*qn+1 dividend limbs are used */
- if (new_np == np)
- /* We need {np,nn} to remain untouched until the final adjustment, so
- we need to allocate separate space for new_np. */
- new_np = TMP_ALLOC_LIMBS (new_nn + 1);
-
-
- dh = dp[dn - 1];
- if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) /* top bit clear: shift the truncated operands left by cnt */
- {
- count_leading_zeros (cnt, dh);
-
- cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt);
- new_np[new_nn] = cy;
-
- new_nn += (cy != 0);
-
- new_dp = TMP_ALLOC_LIMBS (qn + 1);
- mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt);
- new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt); /* bring in the top bits of the next lower divisor limb */
-
- if (qn + 1 == 2)
- {
- qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
- }
- else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
- {
- invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
- qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
- }
- else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
- {
- invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
- qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
- }
- else
- {
- mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
- mp_ptr scratch = TMP_ALLOC_LIMBS (itch); /* shadows the scratch parameter within this block */
- qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
- }
- if (cy == 0)
- tp[qn] = qh;
- else if (UNLIKELY (qh != 0))
- {
- /* This happens only when the quotient is close to B^n and
- mpn_*_divappr_q returned B^n. */
- mp_size_t i, n;
- n = new_nn - (qn + 1);
- for (i = 0; i < n; i++)
- tp[i] = GMP_NUMB_MAX;
- qh = 0; /* currently ignored */
- }
- }
- else /* divisor is already normalised */
- {
- MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless if MU will be used */
-
- new_dp = (mp_ptr) dp + dn - (qn + 1); /* high qn+1 divisor limbs, used in place */
-
- if (qn == 2 - 1)
- {
- qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
- }
- else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
- {
- invert_pi1 (dinv, dh, new_dp[qn - 1]);
- qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
- }
- else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
- {
- invert_pi1 (dinv, dh, new_dp[qn - 1]);
- qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
- }
- else
- {
- mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
- mp_ptr scratch = TMP_ALLOC_LIMBS (itch); /* shadows the scratch parameter within this block */
- qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
- }
- tp[qn] = qh;
- }
-
- MPN_COPY (qp, tp + 1, qn); /* drop the guard limb tp[0] */
- if (tp[0] <= 4) /* small guard limb: check the candidate against the full operands */
- {
- mp_size_t rn;
-
- rp = TMP_ALLOC_LIMBS (dn + qn);
- mpn_mul (rp, dp, dn, tp + 1, qn);
- rn = dn + qn;
- rn -= rp[rn - 1] == 0;
-
- if (rn > nn || mpn_cmp (np, rp, nn) < 0) /* D * Q exceeds N: quotient is one too large */
- mpn_decr_u (qp, 1);
- }
- }
-
- TMP_FREE;
-}
diff --git a/gmp/mpn/generic/div_qr_1.c b/gmp/mpn/generic/div_qr_1.c
deleted file mode 100644
index 09401ac535..0000000000
--- a/gmp/mpn/generic/div_qr_1.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/* mpn_div_qr_1 -- mpn by limb division.
-
- Contributed to the GNU project by Niels Möller and Torbjörn Granlund
-
-Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003, 2013 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#ifndef DIV_QR_1_NORM_THRESHOLD
-#define DIV_QR_1_NORM_THRESHOLD 3
-#endif
-#ifndef DIV_QR_1_UNNORM_THRESHOLD
-#define DIV_QR_1_UNNORM_THRESHOLD 3
-#endif
-
-#if GMP_NAIL_BITS > 0
-#error Nail bits not supported
-#endif
-
-/* Divides {up, n} by d. Writes the n-1 low quotient limbs at {qp,
- * n-1}, and the high quotient limb at *qh. Returns remainder.
- *
- * Requires n > 0 and d > 0 (asserted). A divisor with its top bit clear
- * is shifted left by its leading-zero count and the remainder is shifted
- * back before being returned. Short operands use a plain udiv_qrnnd
- * loop; longer ones compute an inverse with invert_limb and call
- * mpn_div_qr_1n_pi1. */
-mp_limb_t
-mpn_div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t n,
- mp_limb_t d)
-{
- unsigned cnt;
- mp_limb_t uh;
-
- ASSERT (n > 0);
- ASSERT (d > 0);
-
- if (d & GMP_NUMB_HIGHBIT)
- {
- /* Normalized case */
- mp_limb_t dinv, q;
-
- uh = up[--n];
-
- q = (uh >= d); /* the high quotient limb is 0 or 1 here */
- *qh = q;
- uh -= (-q) & d; /* subtract d exactly when q is 1, without branching */
-
- if (BELOW_THRESHOLD (n, DIV_QR_1_NORM_THRESHOLD))
- {
- cnt = 0;
- plain: /* shared limb-by-limb loop; also entered by goto from the unnormalized paths */
- while (n > 0)
- {
- mp_limb_t ul = up[--n];
- udiv_qrnnd (qp[n], uh, uh, ul, d);
- }
- return uh >> cnt; /* undo the normalization shift, if any */
- }
- invert_limb (dinv, d);
- return mpn_div_qr_1n_pi1 (qp, up, n, uh, d, dinv);
- }
- else
- {
- /* Unnormalized case */
- mp_limb_t dinv, ul;
-
- if (! UDIV_NEEDS_NORMALIZATION
- && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD))
- {
- uh = up[--n];
- udiv_qrnnd (*qh, uh, CNST_LIMB(0), uh, d);
- cnt = 0;
- goto plain;
- }
-
- count_leading_zeros (cnt, d);
- d <<= cnt;
-
-#if HAVE_NATIVE_div_qr_1u_pi1
- /* FIXME: Call loop doing on-the-fly normalization */
-#endif
-
- /* Shift up front, use qp area for shifted copy. A bit messy,
- since we have only n-1 limbs available, and shift the high
- limb manually. */
- uh = up[--n];
- ul = (uh << cnt) | mpn_lshift (qp, up, n, cnt);
- uh >>= (GMP_LIMB_BITS - cnt);
-
- if (UDIV_NEEDS_NORMALIZATION
- && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD))
- {
- udiv_qrnnd (*qh, uh, uh, ul, d);
- up = qp; /* the plain loop must read the shifted copy */
- goto plain;
- }
- invert_limb (dinv, d);
-
- udiv_qrnnd_preinv (*qh, uh, uh, ul, d, dinv);
- return mpn_div_qr_1n_pi1 (qp, qp, n, uh, d, dinv) >> cnt;
- }
-}
diff --git a/gmp/mpn/generic/div_qr_1n_pi1.c b/gmp/mpn/generic/div_qr_1n_pi1.c
deleted file mode 100644
index 229ee091a4..0000000000
--- a/gmp/mpn/generic/div_qr_1n_pi1.c
+++ /dev/null
@@ -1,277 +0,0 @@
-/* mpn_div_qr_1n_pi1
-
- Contributed to the GNU project by Niels Möller
-
- THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-
-Copyright 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#if GMP_NAIL_BITS > 0
-#error Nail bits not supported
-#endif
-
-#ifndef DIV_QR_1N_METHOD
-#define DIV_QR_1N_METHOD 2
-#endif
-
-/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */
-
-#if defined (__GNUC__)
-
-#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "add %6, %k2\n\t" \
- "adc %4, %k1\n\t" \
- "sbb %k0, %k0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
- "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
-#endif
-
-#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "add %6, %q2\n\t" \
- "adc %4, %q1\n\t" \
- "sbb %q0, %q0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
- "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
-#endif
-
-#if defined (__sparc__) && W_TYPE_SIZE == 32
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "addcc %r5, %6, %2\n\t" \
- "addxcc %r3, %4, %1\n\t" \
- "subx %%g0, %%g0, %0" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
- __CLOBBER_CC)
-#endif
-
-#if defined (__sparc__) && W_TYPE_SIZE == 64
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "addcc %r5, %6, %2\n\t" \
- "addccc %r7, %8, %%g0\n\t" \
- "addccc %r3, %4, %1\n\t" \
- "clr %0\n\t" \
- "movcs %%xcc, -1, %0" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
- "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
- __CLOBBER_CC)
-#if __VIS__ >= 0x300
-#undef add_mssaaaa
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "addcc %r5, %6, %2\n\t" \
- "addxccc %r3, %4, %1\n\t" \
- "clr %0\n\t" \
- "movcs %%xcc, -1, %0" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
- __CLOBBER_CC)
-#endif
-#endif
-
-#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
-/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
- processor running in 32-bit mode, since the carry flag then gets the 32-bit
- carry. */
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "add%I6c %2, %5, %6\n\t" \
- "adde %1, %3, %4\n\t" \
- "subfe %0, %0, %0\n\t" \
- "nor %0, %0, %0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
-#endif
-
-#if defined (__s390x__) && W_TYPE_SIZE == 64
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "algr %2, %6\n\t" \
- "alcgr %1, %4\n\t" \
- "lghi %0, 0\n\t" \
- "alcgr %0, %0\n\t" \
- "lcgr %0, %0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
- "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
-#endif
-
-#if defined (__arm__) && W_TYPE_SIZE == 32
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "adds %2, %5, %6\n\t" \
- "adcs %1, %3, %4\n\t" \
- "movcc %0, #0\n\t" \
- "movcs %0, #-1" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
-#endif
-#endif /* defined (__GNUC__) */
-
-#ifndef add_mssaaaa
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- do { \
- UWtype __s0, __s1, __c0, __c1; \
- __s0 = (a0) + (b0); \
- __s1 = (a1) + (b1); \
- __c0 = __s0 < (a0); \
- __c1 = __s1 < (a1); \
- (s0) = __s0; \
- __s1 = __s1 + __c0; \
- (s1) = __s1; \
- (m) = - (__c1 + (__s1 < __c0)); \
- } while (0)
-#endif
-
-#if DIV_QR_1N_METHOD == 1
-
-/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
- Requires that uh < d. */
-mp_limb_t
-mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
- mp_limb_t d, mp_limb_t dinv)
-{
- ASSERT (n > 0);
- ASSERT (uh < d);
- ASSERT (d & GMP_NUMB_HIGHBIT);
- ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));
-
- do
- {
- mp_limb_t q, ul;
-
- ul = up[--n];
- udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
- qp[n] = q;
- }
- while (n > 0);
-
- return uh;
-}
-
-#elif DIV_QR_1N_METHOD == 2
-
-mp_limb_t
-mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
- mp_limb_t d, mp_limb_t dinv)
-{
- mp_limb_t B2;
- mp_limb_t u0, u2;
- mp_limb_t q0, q1;
- mp_limb_t p0, p1;
- mp_limb_t t;
- mp_size_t j;
-
- ASSERT (d & GMP_LIMB_HIGHBIT);
- ASSERT (n > 0);
- ASSERT (u1 < d);
-
- if (n == 1)
- {
- udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
- return u1;
- }
-
- /* FIXME: Could be precomputed */
- B2 = -d*dinv;
-
- umul_ppmm (q1, q0, dinv, u1);
- umul_ppmm (p1, p0, B2, u1);
- q1 += u1;
- ASSERT (q1 >= u1);
- u0 = up[n-1]; /* Early read, to allow qp == up. */
- qp[n-1] = q1;
-
- add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);
-
- /* FIXME: Keep q1 in a variable between iterations, to reduce number
- of memory accesses. */
- for (j = n-2; j-- > 0; )
- {
- mp_limb_t q2, cy;
-
- /* Additions for the q update:
- * +-------+
- * |u1 * v |
- * +---+---+
- * | u1|
- * +---+---+
- * | 1 | v | (conditional on u2)
- * +---+---+
- * | 1 | (conditional on u0 + u2 B2 carry)
- * +---+
- * + | q0|
- * -+---+---+---+
- * | q2| q1| q0|
- * +---+---+---+
- */
- umul_ppmm (p1, t, u1, dinv);
- add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
- add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
- add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
- q0 = t;
-
- umul_ppmm (p1, p0, u1, B2);
- ADDC_LIMB (cy, u0, u0, u2 & B2);
- u0 -= (-cy) & d;
-
- /* Final q update */
- add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
- qp[j+1] = q1;
- MPN_INCR_U (qp+j+2, n-j-2, q2);
-
- add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
- }
-
- q1 = (u2 > 0);
- u1 -= (-q1) & d;
-
- t = (u1 >= d);
- q1 += t;
- u1 -= (-t) & d;
-
- udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
- add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
-
- MPN_INCR_U (qp+1, n-1, q1);
-
- qp[0] = q0;
- return u0;
-}
-
-#else
-#error Unknown DIV_QR_1N_METHOD
-#endif
diff --git a/gmp/mpn/generic/div_qr_1n_pi2.c b/gmp/mpn/generic/div_qr_1n_pi2.c
deleted file mode 100644
index 7ea3410cb6..0000000000
--- a/gmp/mpn/generic/div_qr_1n_pi2.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/* mpn_div_qr_1n_pi2.
-
- THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS
- ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/* ISSUES:
-
- * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?
-
- * Are there any problems with generating n quotient limbs in the q area? It
- surely simplifies things.
-
- * Not yet adequately tested.
-*/
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Define some longlong.h-style macros, but for wider operations.
- * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
- carry-out into an additional sum operand.
-*/
-#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
-
-#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "0" ((USItype)(s2)), \
- "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
- "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
-#endif
-
-#if defined (__amd64__) && W_TYPE_SIZE == 64
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "0" ((UDItype)(s2)), \
- "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
- "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
-#endif
-
-#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
-/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
- processor running in 32-bit mode, since the carry flag then gets the 32-bit
- carry. */
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
-#endif
-
-#endif /* __GNUC__ */
-
-#ifndef add_sssaaaa
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- do { \
- UWtype __s0, __s1, __c0, __c1; \
- __s0 = (a0) + (b0); \
- __s1 = (a1) + (b1); \
- __c0 = __s0 < (a0); \
- __c1 = __s1 < (a1); \
- (s0) = __s0; \
- __s1 = __s1 + __c0; \
- (s1) = __s1; \
- (s2) += __c1 + (__s1 < __c0); \
- } while (0)
-#endif
-
-struct precomp_div_1_pi2
-{
- mp_limb_t dip[2];
- mp_limb_t d;
- int norm_cnt;
-};
-
-mp_limb_t
-mpn_div_qr_1n_pi2 (mp_ptr qp,
- mp_srcptr up, mp_size_t un,
- struct precomp_div_1_pi2 *pd)
-{
- mp_limb_t most_significant_q_limb;
- mp_size_t i;
- mp_limb_t r, u2, u1, u0;
- mp_limb_t d0, di1, di0;
- mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
- mp_limb_t cnd;
-
- ASSERT (un >= 2);
- ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
- ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
- ASSERT_MPN (up, un);
-
-#define q3 q3a
-#define q2 q2b
-#define q1 q1b
-
- up += un - 3;
- r = up[2];
- d0 = pd->d;
-
- most_significant_q_limb = (r >= d0);
- r -= d0 & -most_significant_q_limb;
-
- qp += un - 3;
- qp[2] = most_significant_q_limb;
-
- di1 = pd->dip[1];
- di0 = pd->dip[0];
-
- for (i = un - 3; i >= 0; i -= 2)
- {
- u2 = r;
- u1 = up[1];
- u0 = up[0];
-
- /* Dividend in {r,u1,u0} */
-
- umul_ppmm (q1d,q0d, u1, di0);
- umul_ppmm (q2b,q1b, u1, di1);
- q2b++; /* cannot spill */
- add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
-
- umul_ppmm (q2c,q1c, u2, di0);
- add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
- umul_ppmm (q3a,q2a, u2, di1);
-
- add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
-
- q3 += r;
-
- r = u0 - q2 * d0;
-
- cnd = (r >= q1);
- r += d0 & -cnd;
- sub_ddmmss (q3,q2, q3,q2, 0,cnd);
-
- if (UNLIKELY (r >= d0))
- {
- r -= d0;
- add_ssaaaa (q3,q2, q3,q2, 0,1);
- }
-
- qp[0] = q2;
- qp[1] = q3;
-
- up -= 2;
- qp -= 2;
- }
-
- if ((un & 1) == 0)
- {
- u2 = r;
- u1 = up[1];
-
- udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
- qp[1] = q3;
- }
-
- return r;
-
-#undef q3
-#undef q2
-#undef q1
-}
diff --git a/gmp/mpn/generic/div_qr_1u_pi2.c b/gmp/mpn/generic/div_qr_1u_pi2.c
deleted file mode 100644
index 83d66ef29e..0000000000
--- a/gmp/mpn/generic/div_qr_1u_pi2.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/* mpn_div_qr_1u_pi2.
-
- THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS
- ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/* ISSUES:
-
- * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?
-
- * Are there any problems with generating n quotient limbs in the q area? It
- surely simplifies things.
-
- * Not yet adequately tested.
-*/
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Define some longlong.h-style macros, but for wider operations.
- * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
- carry-out into an additional sum operand.
-*/
-#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
-
-#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "0" ((USItype)(s2)), \
- "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
- "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
-#endif
-
-#if defined (__amd64__) && W_TYPE_SIZE == 64
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "0" ((UDItype)(s2)), \
- "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
- "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
-#endif
-
-#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
-/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
- processor running in 32-bit mode, since the carry flag then gets the 32-bit
- carry. */
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
-#endif
-
-#endif /* __GNUC__ */
-
-#ifndef add_sssaaaa
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- do { \
- UWtype __s0, __s1, __c0, __c1; \
- __s0 = (a0) + (b0); \
- __s1 = (a1) + (b1); \
- __c0 = __s0 < (a0); \
- __c1 = __s1 < (a1); \
- (s0) = __s0; \
- __s1 = __s1 + __c0; \
- (s1) = __s1; \
- (s2) += __c1 + (__s1 < __c0); \
- } while (0)
-#endif
-
-struct precomp_div_1_pi2
-{
- mp_limb_t dip[2];
- mp_limb_t d;
- int norm_cnt;
-};
-
-mp_limb_t
-mpn_div_qr_1u_pi2 (mp_ptr qp,
- mp_srcptr up, mp_size_t un,
- struct precomp_div_1_pi2 *pd)
-{
- mp_size_t i;
- mp_limb_t r, u2, u1, u0;
- mp_limb_t d0, di1, di0;
- mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
- mp_limb_t cnd;
- int cnt;
-
- ASSERT (un >= 2);
- ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
- ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
- ASSERT_MPN (up, un);
-
-#define q3 q3a
-#define q2 q2b
-#define q1 q1b
-
- up += un - 3;
- cnt = pd->norm_cnt;
- r = up[2] >> (GMP_NUMB_BITS - cnt);
- d0 = pd->d << cnt;
-
- qp += un - 2;
-
- di1 = pd->dip[1];
- di0 = pd->dip[0];
-
- for (i = un - 3; i >= 0; i -= 2)
- {
- u2 = r;
- u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
- u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));
-
- /* Dividend in {r,u1,u0} */
-
- umul_ppmm (q1d,q0d, u1, di0);
- umul_ppmm (q2b,q1b, u1, di1);
- q2b++; /* cannot spill */
- add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
-
- umul_ppmm (q2c,q1c, u2, di0);
- add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
- umul_ppmm (q3a,q2a, u2, di1);
-
- add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
-
- q3 += r;
-
- r = u0 - q2 * d0;
-
- cnd = (r >= q1);
- r += d0 & -cnd;
- sub_ddmmss (q3,q2, q3,q2, 0,cnd);
-
- if (UNLIKELY (r >= d0))
- {
- r -= d0;
- add_ssaaaa (q3,q2, q3,q2, 0,1);
- }
-
- qp[0] = q2;
- qp[1] = q3;
-
- up -= 2;
- qp -= 2;
- }
-
- if ((un & 1) != 0)
- {
- u2 = r;
- u1 = (up[2] << cnt);
-
- udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
- qp[1] = q3;
- }
- else
- {
- u2 = r;
- u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
- u0 = (up[1] << cnt);
-
- /* Dividend in {r,u1,u0} */
-
- umul_ppmm (q1d,q0d, u1, di0);
- umul_ppmm (q2b,q1b, u1, di1);
- q2b++; /* cannot spill */
- add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
-
- umul_ppmm (q2c,q1c, u2, di0);
- add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
- umul_ppmm (q3a,q2a, u2, di1);
-
- add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
-
- q3 += r;
-
- r = u0 - q2 * d0;
-
- cnd = (r >= q1);
- r += d0 & -cnd;
- sub_ddmmss (q3,q2, q3,q2, 0,cnd);
-
- if (UNLIKELY (r >= d0))
- {
- r -= d0;
- add_ssaaaa (q3,q2, q3,q2, 0,1);
- }
-
- qp[0] = q2;
- qp[1] = q3;
- }
-
- return r >> cnt;
-
-#undef q3
-#undef q2
-#undef q1
-}
diff --git a/gmp/mpn/generic/div_qr_2.c b/gmp/mpn/generic/div_qr_2.c
deleted file mode 100644
index cb07e0e3b4..0000000000
--- a/gmp/mpn/generic/div_qr_2.c
+++ /dev/null
@@ -1,332 +0,0 @@
-/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
- quotient. The divisor is two limbs.
-
- Contributed to the GNU project by Torbjorn Granlund and Niels Möller
-
- THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-
-Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#ifndef DIV_QR_2_PI2_THRESHOLD
-/* Disabled unless explicitly tuned. */
-#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
-#endif
-
-#ifndef SANITY_CHECK
-#define SANITY_CHECK 0
-#endif
-
-/* Define some longlong.h-style macros, but for wider operations.
- * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
- carry-out into an additional sum operand.
- * add_csaac accepts two addends and a carry in, and generates a sum
- and a carry out. A little like a "full adder".
-*/
-#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
-
-#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "0" ((USItype)(s2)), \
- "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
- "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
-#define add_csaac(co, s, a, b, ci) \
- __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0" \
- : "=r" (co), "=r" (s) \
- : "rm" ((USItype)(ci)), "0" (CNST_LIMB(0)), \
- "%1" ((USItype)(a)), "g" ((USItype)(b)))
-#endif
-
-#if defined (__amd64__) && W_TYPE_SIZE == 64
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "0" ((UDItype)(s2)), \
- "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
- "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
-#define add_csaac(co, s, a, b, ci) \
- __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0" \
- : "=r" (co), "=r" (s) \
- : "rm" ((UDItype)(ci)), "0" (CNST_LIMB(0)), \
- "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
-#endif
-
-#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
-/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
- processor running in 32-bit mode, since the carry flag then gets the 32-bit
- carry. */
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
- : "=r" (s2), "=&r" (s1), "=&r" (s0) \
- : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
-#endif
-
-#endif /* __GNUC__ */
-
-#ifndef add_sssaaaa
-#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
- do { \
- UWtype __s0, __s1, __c0, __c1; \
- __s0 = (a0) + (b0); \
- __s1 = (a1) + (b1); \
- __c0 = __s0 < (a0); \
- __c1 = __s1 < (a1); \
- (s0) = __s0; \
- __s1 = __s1 + __c0; \
- (s1) = __s1; \
- (s2) += __c1 + (__s1 < __c0); \
- } while (0)
-#endif
-
-#ifndef add_csaac
-#define add_csaac(co, s, a, b, ci) \
- do { \
- UWtype __s, __c; \
- __s = (a) + (b); \
- __c = __s < (a); \
- __s = __s + (ci); \
- (s) = __s; \
- (co) = __c + (__s < (ci)); \
- } while (0)
-#endif
-
-/* Typically used with r1, r0 same as n3, n2. Other types of overlap
- between inputs and outputs are not supported. */
-#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
- do { \
- mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
- mp_limb_t _t1, _t0; \
- mp_limb_t _c, _mask; \
- \
- umul_ppmm (_q3,_q2a, n3, di1); \
- umul_ppmm (_q2,_q1, n2, di1); \
- umul_ppmm (_q2c,_q1c, n3, di0); \
- add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \
- umul_ppmm (_q1d,_q0, n2, di0); \
- add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \
- \
- add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1)); \
- \
- /* [q3,q2,q1,q0] += [r1,r0,n1,n0], with <r1,r0> = <n3,n2> + 1 */ \
- add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0)); \
- add_csaac (_c, _q1, _q1, n1, _c); \
- add_csaac (_c, _q2, _q2, r0, _c); \
- _q3 = _q3 + r1 + _c; \
- \
- umul_ppmm (_t1,_t0, _q2, d0); \
- _t1 += _q2 * d1 + _q3 * d0; \
- \
- sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
- \
- _mask = -(mp_limb_t) (r1 >= _q1 & (r1 > _q1 | r0 >= _q0)); /* (r1,r0) >= (q1,q0) */ \
- add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
- sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \
- \
- if (UNLIKELY (r1 >= d1)) \
- { \
- if (r1 > d1 || r0 >= d0) \
- { \
- sub_ddmmss (r1, r0, r1, r0, d1, d0); \
- add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
- } \
- } \
- (q1) = _q3; \
- (q0) = _q2; \
- } while (0)
-
-static void
-invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
-{
- mp_limb_t v1, v0, p1, t1, t0, p0, mask;
- invert_limb (v1, d1);
- p1 = d1 * v1;
- /* <1, v1> * d1 = <B-1, p1> */
- p1 += d0;
- if (p1 < d0)
- {
- v1--;
- mask = -(mp_limb_t) (p1 >= d1);
- p1 -= d1;
- v1 += mask;
- p1 -= mask & d1;
- }
- /* <1, v1> * d1 + d0 = <B-1, p1> */
- umul_ppmm (t1, p0, d0, v1);
- p1 += t1;
- if (p1 < t1)
- {
- if (UNLIKELY (p1 >= d1))
- {
- if (p1 > d1 || p0 >= d0)
- {
- sub_ddmmss (p1, p0, p1, p0, d1, d0);
- v1--;
- }
- }
- sub_ddmmss (p1, p0, p1, p0, d1, d0);
- v1--;
- }
- /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
- * with <p1, p0> + <d1, d0> >= B^2.
- *
- * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
- * partial remainder after <1, v1> is
- *
- * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
- * = <~p1, ~p0, B-1>
- */
- udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
- di[0] = v0;
- di[1] = v1;
-
-#if SANITY_CHECK
- {
- mp_limb_t tp[4];
- mp_limb_t dp[2];
- dp[0] = d0;
- dp[1] = d1;
- mpn_mul_n (tp, dp, di, 2);
- ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
- ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
- ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
- ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
- }
-#endif
-}
-
-static mp_limb_t
-mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
- mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
-{
- mp_limb_t qh;
- mp_size_t i;
- mp_limb_t r1, r0;
-
- ASSERT (nn >= 2);
- ASSERT (d1 & GMP_NUMB_HIGHBIT);
-
- r1 = np[nn-1];
- r0 = np[nn-2];
-
- qh = 0;
- if (r1 >= d1 && (r1 > d1 || r0 >= d0))
- {
-#if GMP_NAIL_BITS == 0
- sub_ddmmss (r1, r0, r1, r0, d1, d0);
-#else
- r0 = r0 - d0;
- r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
- r0 &= GMP_NUMB_MASK;
-#endif
- qh = 1;
- }
-
- for (i = nn - 2; i >= 2; i -= 2)
- {
- mp_limb_t n1, n0, q1, q0;
- n1 = np[i-1];
- n0 = np[i-2];
- udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
- qp[i-1] = q1;
- qp[i-2] = q0;
- }
-
- if (i > 0)
- {
- mp_limb_t q;
- udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
- qp[0] = q;
- }
- rp[1] = r1;
- rp[0] = r0;
-
- return qh;
-}
-
-
-/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
- significant quotient limbs at qp and the 2-limb remainder at rp.
- Return the most significant limb of the quotient.
-
- Preconditions:
- 1. qp must either not overlap with the input operands at all, or
- qp >= np + 2 must hold true. (This means that it's possible to put
- the quotient in the high part of {np,nn}, right above the remainder.)
- 2. nn >= 2. */
-
-mp_limb_t
-mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
- mp_srcptr dp)
-{
- mp_limb_t d1;
- mp_limb_t d0;
- gmp_pi1_t dinv;
-
- ASSERT (nn >= 2);
- ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
- ASSERT_MPN (np, nn);
- ASSERT_MPN (dp, 2);
-
- d1 = dp[1]; d0 = dp[0];
-
- ASSERT (d1 > 0);
-
- if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
- {
- if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
- {
- gmp_pi1_t dinv;
- invert_pi1 (dinv, d1, d0);
- return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
- }
- else
- {
- mp_limb_t di[2];
- invert_4by2 (di, d1, d0);
- return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
- }
- }
- else
- {
- int shift;
- count_leading_zeros (shift, d1);
- d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
- d0 <<= shift;
- invert_pi1 (dinv, d1, d0);
- return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
- }
-}
diff --git a/gmp/mpn/generic/div_qr_2n_pi1.c b/gmp/mpn/generic/div_qr_2n_pi1.c
deleted file mode 100644
index da500e2170..0000000000
--- a/gmp/mpn/generic/div_qr_2n_pi1.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* mpn_div_qr_2n_pi1
-
- Contributed to the GNU project by Torbjorn Granlund and Niels Möller
-
- THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-
-Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-
-/* 3/2 loop, for normalized divisor */
-mp_limb_t
-mpn_div_qr_2n_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
- mp_limb_t d1, mp_limb_t d0, mp_limb_t di)
-{
- mp_limb_t qh;
- mp_size_t i;
- mp_limb_t r1, r0;
-
- ASSERT (nn >= 2);
- ASSERT (d1 & GMP_NUMB_HIGHBIT);
-
- np += nn - 2;
- r1 = np[1];
- r0 = np[0];
-
- qh = 0;
- if (r1 >= d1 && (r1 > d1 || r0 >= d0))
- {
-#if GMP_NAIL_BITS == 0
- sub_ddmmss (r1, r0, r1, r0, d1, d0);
-#else
- r0 = r0 - d0;
- r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
- r0 &= GMP_NUMB_MASK;
-#endif
- qh = 1;
- }
-
- for (i = nn - 2 - 1; i >= 0; i--)
- {
- mp_limb_t n0, q;
- n0 = np[-1];
- udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di);
- np--;
- qp[i] = q;
- }
-
- rp[1] = r1;
- rp[0] = r0;
-
- return qh;
-}
diff --git a/gmp/mpn/generic/div_qr_2u_pi1.c b/gmp/mpn/generic/div_qr_2u_pi1.c
deleted file mode 100644
index 0b9ddf5753..0000000000
--- a/gmp/mpn/generic/div_qr_2u_pi1.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/* mpn_div_qr_2u_pi1
-
- Contributed to the GNU project by Niels Möller
-
- THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-
-/* 3/2 loop, for unnormalized divisor. Caller must pass shifted d1 and
- d0, while {np,nn} is shifted on the fly. */
-mp_limb_t
-mpn_div_qr_2u_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
- mp_limb_t d1, mp_limb_t d0, int shift, mp_limb_t di)
-{
- mp_limb_t qh;
- mp_limb_t r2, r1, r0;
- mp_size_t i;
-
- ASSERT (nn >= 2);
- ASSERT (d1 & GMP_NUMB_HIGHBIT);
- ASSERT (shift > 0);
-
- r2 = np[nn-1] >> (GMP_LIMB_BITS - shift);
- r1 = (np[nn-1] << shift) | (np[nn-2] >> (GMP_LIMB_BITS - shift));
- r0 = np[nn-2] << shift;
-
- udiv_qr_3by2 (qh, r2, r1, r2, r1, r0, d1, d0, di);
-
- for (i = nn - 2 - 1; i >= 0; i--)
- {
- mp_limb_t q;
- r0 = np[i];
- r1 |= r0 >> (GMP_LIMB_BITS - shift);
- r0 <<= shift;
- udiv_qr_3by2 (q, r2, r1, r2, r1, r0, d1, d0, di);
- qp[i] = q;
- }
-
- rp[0] = (r1 >> shift) | (r2 << (GMP_LIMB_BITS - shift));
- rp[1] = r2 >> shift;
-
- return qh;
-}
diff --git a/gmp/mpn/generic/dive_1.c b/gmp/mpn/generic/dive_1.c
index 1c0a4e894d..27df57b80e 100644
--- a/gmp/mpn/generic/dive_1.c
+++ b/gmp/mpn/generic/dive_1.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2000-2003, 2005, 2013 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -41,7 +30,7 @@ see https://www.gnu.org/licenses/. */
/* Divide a={src,size} by d=divisor and store the quotient in q={dst,size}.
q will only be correct if d divides a exactly.
- A separate loop is used for shift==0 because n<<GMP_LIMB_BITS doesn't
+ A separate loop is used for shift==0 because n<<BITS_PER_MP_LIMB doesn't
give zero on all CPUs (for instance it doesn't on the x86s). This
separate loop might run faster too, helping odd divisors.
@@ -61,7 +50,7 @@ see https://www.gnu.org/licenses/. */
faster on some CPUs and would mean just the shift==0 style loop would be
needed.
- If n<<GMP_LIMB_BITS gives zero on a particular CPU then the separate
+ If n<<BITS_PER_MP_LIMB gives zero on a particular CPU then the separate
shift==0 loop is unnecessary, and could be eliminated if there's no great
speed difference.
@@ -87,6 +76,14 @@ mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
ASSERT_MPN (src, size);
ASSERT_LIMB (divisor);
+ s = src[0];
+
+ if (size == 1)
+ {
+ dst[0] = s / divisor;
+ return;
+ }
+
if ((divisor & 1) == 0)
{
count_trailing_zeros (shift, divisor);
@@ -101,39 +98,40 @@ mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
if (shift != 0)
{
c = 0;
+ i = 0;
+ size--;
- s = src[0];
-
- for (i = 1; i < size; i++)
+ do
{
- s_next = src[i];
+ s_next = src[i+1];
ls = ((s >> shift) | (s_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK;
s = s_next;
SUBC_LIMB (c, l, ls, c);
l = (l * inverse) & GMP_NUMB_MASK;
- dst[i - 1] = l;
+ dst[i] = l;
umul_ppmm (h, dummy, l, divisor);
c += h;
+
+ i++;
}
while (i < size);
ls = s >> shift;
l = ls - c;
l = (l * inverse) & GMP_NUMB_MASK;
- dst[size - 1] = l;
+ dst[i] = l;
}
else
{
- s = src[0];
-
l = (s * inverse) & GMP_NUMB_MASK;
dst[0] = l;
+ i = 1;
c = 0;
- for (i = 1; i < size; i++)
+ do
{
umul_ppmm (h, dummy, l, divisor);
c += h;
@@ -143,6 +141,8 @@ mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
l = (l * inverse) & GMP_NUMB_MASK;
dst[i] = l;
+ i++;
}
+ while (i < size);
}
}
diff --git a/gmp/mpn/generic/diveby3.c b/gmp/mpn/generic/diveby3.c
index 2ffd9fe777..6293f65a89 100644
--- a/gmp/mpn/generic/diveby3.c
+++ b/gmp/mpn/generic/diveby3.c
@@ -1,32 +1,21 @@
/* mpn_divexact_by3c -- mpn exact division by 3.
-Copyright 2000-2003, 2008 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/divexact.c b/gmp/mpn/generic/divexact.c
index 47a47e3d80..a0e439cbee 100644
--- a/gmp/mpn/generic/divexact.c
+++ b/gmp/mpn/generic/divexact.c
@@ -4,104 +4,28 @@
Contributed to the GNU project by Torbjorn Granlund.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
-Copyright 2006, 2007, 2009 Free Software Foundation, Inc.
+Copyright 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#if 1
-void
-mpn_divexact (mp_ptr qp,
- mp_srcptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn)
-{
- unsigned shift;
- mp_size_t qn;
- mp_ptr tp;
- TMP_DECL;
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
- ASSERT (dn > 0);
- ASSERT (nn >= dn);
- ASSERT (dp[dn-1] > 0);
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- while (dp[0] == 0)
- {
- ASSERT (np[0] == 0);
- dp++;
- np++;
- dn--;
- nn--;
- }
-
- if (dn == 1)
- {
- MPN_DIVREM_OR_DIVEXACT_1 (qp, np, nn, dp[0]);
- return;
- }
-
- TMP_MARK;
-
- qn = nn + 1 - dn;
- count_trailing_zeros (shift, dp[0]);
-
- if (shift > 0)
- {
- mp_ptr wp;
- mp_size_t ss;
- ss = (dn > qn) ? qn + 1 : dn;
-
- tp = TMP_ALLOC_LIMBS (ss);
- mpn_rshift (tp, dp, ss, shift);
- dp = tp;
-
- /* Since we have excluded dn == 1, we have nn > qn, and we need
- to shift one limb beyond qn. */
- wp = TMP_ALLOC_LIMBS (qn + 1);
- mpn_rshift (wp, np, qn + 1, shift);
- np = wp;
- }
-
- if (dn > qn)
- dn = qn;
-
- tp = TMP_ALLOC_LIMBS (mpn_bdiv_q_itch (qn, dn));
- mpn_bdiv_q (qp, np, qn, dp, dn, tp);
- TMP_FREE;
-}
-
-#else
/* We use the Jebelean's bidirectional exact division algorithm. This is
somewhat naively implemented, with equal quotient parts done by 2-adic
@@ -120,8 +44,17 @@ mpn_divexact (mp_ptr qp,
* It makes the msb part 1 or 2 limbs larger than the lsb part, in spite of
that the latter is faster. We should at least reverse this, but perhaps
we should make the lsb part considerably larger. (How do we tune this?)
+
+ Perhaps we could somehow use 2-adic division for both parts, not as now
+ truncating division for the upper part and 2-adic for the lower part.
*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
mp_size_t
mpn_divexact_itch (mp_size_t nn, mp_size_t dn)
{
@@ -143,8 +76,7 @@ mpn_divexact (mp_ptr qp,
int cnt;
mp_ptr xdp;
mp_limb_t di;
- mp_limb_t cy;
- gmp_pi1_t dinv;
+ mp_limb_t dip[2], xp[2], cy;
TMP_DECL;
TMP_MARK;
@@ -158,7 +90,7 @@ mpn_divexact (mp_ptr qp,
MPN_COPY (tp, np, qn);
binvert_limb (di, dp[0]); di = -di;
dn = MIN (dn, qn);
- mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
+ mpn_sb_bdiv_q (qp, tp, qn, dp, dn, di);
TMP_FREE;
return;
}
@@ -175,14 +107,14 @@ mpn_divexact (mp_ptr qp,
MPN_COPY (tp, np, qn);
binvert_limb (di, dp[0]); di = -di;
dn = MIN (dn, qn);
- mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
+ mpn_sb_bdiv_q (qp, tp, qn, dp, dn, di);
}
else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
{
tp = scratch;
MPN_COPY (tp, np, qn);
binvert_limb (di, dp[0]); di = -di;
- mpn_dcpi1_bdiv_q (qp, tp, qn, dp, dn, di);
+ mpn_dc_bdiv_q (qp, tp, qn, dp, dn, di);
}
else
{
@@ -248,14 +180,23 @@ mpn_divexact (mp_ptr qp,
MPN_COPY (tp, np + nn - nn1, nn1);
}
- invert_pi1 (dinv, xdp[qn1 - 1], xdp[qn1 - 2]);
if (BELOW_THRESHOLD (qn1, DC_DIVAPPR_Q_THRESHOLD))
{
- qp[qn0 - 1 + nn1 - qn1] = mpn_sbpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dinv.inv32);
+ /* Compute divisor inverse. */
+ cy = mpn_add_1 (xp, xdp + qn1 - 2, 2, 1);
+ if (cy != 0)
+ dip[0] = dip[1] = 0;
+ else
+ {
+ mp_limb_t scratch[10]; /* FIXME */
+ mpn_invert (dip, xp, 2, scratch);
+ }
+
+ qp[qn0 - 1 + nn1 - qn1] = mpn_sb_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dip);
}
else if (BELOW_THRESHOLD (qn1, MU_DIVAPPR_Q_THRESHOLD))
{
- qp[qn0 - 1 + nn1 - qn1] = mpn_dcpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, &dinv);
+ qp[qn0 - 1 + nn1 - qn1] = mpn_dc_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1);
}
else
{
@@ -274,12 +215,12 @@ mpn_divexact (mp_ptr qp,
if (BELOW_THRESHOLD (qn0, DC_BDIV_Q_THRESHOLD))
{
MPN_COPY (tp, np, qn0);
- mpn_sbpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
+ mpn_sb_bdiv_q (qp, tp, qn0, dp, qn0, di);
}
else if (BELOW_THRESHOLD (qn0, MU_BDIV_Q_THRESHOLD))
{
MPN_COPY (tp, np, qn0);
- mpn_dcpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
+ mpn_dc_bdiv_q (qp, tp, qn0, dp, qn0, di);
}
else
{
@@ -291,4 +232,3 @@ mpn_divexact (mp_ptr qp,
TMP_FREE;
}
-#endif
diff --git a/gmp/mpn/generic/divis.c b/gmp/mpn/generic/divis.c
index 9e162e60d2..b05ecd8a78 100644
--- a/gmp/mpn/generic/divis.c
+++ b/gmp/mpn/generic/divis.c
@@ -4,80 +4,86 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2001, 2002, 2005, 2009 Free Software Foundation, Inc.
+Copyright 2001, 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-/* Determine whether A={ap,an} is divisible by D={dp,dn}. Must have both
- operands normalized, meaning high limbs non-zero, except that an==0 is
+/* Determine whether {ap,asize} is divisible by {dp,dsize}. Must have both
+ operands normalized, meaning high limbs non-zero, except that asize==0 is
allowed.
- There usually won't be many low zero bits on D, but the checks for this
+ There usually won't be many low zero bits on d, but the checks for this
are fast and might pick up a few operand combinations, in particular they
- might reduce D to fit the single-limb mod_1/modexact_1 code.
+ might reduce d to fit the single-limb mod_1/modexact_1 code.
Future:
+ This is currently not much faster than the user doing an mpz_tdiv_r
+ and testing for a zero remainder, but hopefully it can be improved.
+
+ mpn_bdivmod is one possibility, but it only trades udiv_qrnnd's for
+ multiplies, it won't save crossproducts the way it can in mpz_divexact.
+ Definitely worthwhile on small operands for most processors, but a
+ sub-quadratic version will be wanted before it can be used on all sizes.
+
Getting the remainder limb by limb would make an early exit possible on
finding a non-zero. This would probably have to be bdivmod style so
there's no addback, but it would need a multi-precision inverse and so
might be slower than the plain method (on small sizes at least).
- When D must be normalized (shifted to low bit set), it's possible to
- suppress the bit-shifting of A down, as long as it's already been checked
- that A has at least as many trailing zero bits as D. */
+ When d must be normalized (shifted to high bit set), it's possible to
+ just append a low zero limb to "a" rather than bit-shifting as
+ mpn_tdiv_qr does internally, so long as it's already been checked that a
+ has at least as many trailing zero bits as d. Or equivalently, pass
+ qxn==1 to mpn_tdiv_qr, if/when it accepts that.
+
+ When called from mpz_congruent_p, {ap,asize} is a temporary which can be
+ destroyed. Maybe it'd be possible to get into mpn_tdiv_qr at a lower
+ level to save copying it, or maybe that function could accept rp==ap.
+
+ Could use __attribute__ ((regparm (2))) on i386, so the parameters
+ wouldn't need extra stack when called from mpz_divisible_p, but a
+ pre-release gcc 3 didn't generate particularly good register juggling in
+ that case, so this isn't done for now. */
int
-mpn_divisible_p (mp_srcptr ap, mp_size_t an,
- mp_srcptr dp, mp_size_t dn)
+mpn_divisible_p (mp_srcptr ap, mp_size_t asize,
+ mp_srcptr dp, mp_size_t dsize)
{
mp_limb_t alow, dlow, dmask;
- mp_ptr qp, rp, tp;
+ mp_ptr qp, rp;
mp_size_t i;
- mp_limb_t di;
- unsigned twos;
TMP_DECL;
- ASSERT (an >= 0);
- ASSERT (an == 0 || ap[an-1] != 0);
- ASSERT (dn >= 1);
- ASSERT (dp[dn-1] != 0);
- ASSERT_MPN (ap, an);
- ASSERT_MPN (dp, dn);
+ ASSERT (asize >= 0);
+ ASSERT (asize == 0 || ap[asize-1] != 0);
+ ASSERT (dsize >= 1);
+ ASSERT (dp[dsize-1] != 0);
+ ASSERT_MPN (ap, asize);
+ ASSERT_MPN (dp, dsize);
/* When a<d only a==0 is divisible.
- Notice this test covers all cases of an==0. */
- if (an < dn)
- return (an == 0);
+ Notice this test covers all cases of asize==0. */
+ if (asize < dsize)
+ return (asize == 0);
/* Strip low zero limbs from d, requiring a==0 on those. */
for (;;)
@@ -91,9 +97,9 @@ mpn_divisible_p (mp_srcptr ap, mp_size_t an,
if (alow != 0)
return 0; /* a has fewer low zero limbs than d, so not divisible */
- /* a!=0 and d!=0 so won't get to n==0 */
- an--; ASSERT (an >= 1);
- dn--; ASSERT (dn >= 1);
+ /* a!=0 and d!=0 so won't get to size==0 */
+ asize--; ASSERT (asize >= 1);
+ dsize--; ASSERT (dsize >= 1);
ap++;
dp++;
}
@@ -103,88 +109,41 @@ mpn_divisible_p (mp_srcptr ap, mp_size_t an,
if ((alow & dmask) != 0)
return 0;
- if (dn == 1)
+ if (dsize == 1)
{
- if (ABOVE_THRESHOLD (an, BMOD_1_TO_MOD_1_THRESHOLD))
- return mpn_mod_1 (ap, an, dlow) == 0;
+ if (BELOW_THRESHOLD (asize, MODEXACT_1_ODD_THRESHOLD))
+ return mpn_mod_1 (ap, asize, dlow) == 0;
- count_trailing_zeros (twos, dlow);
- dlow >>= twos;
- return mpn_modexact_1_odd (ap, an, dlow) == 0;
+ if ((dlow & 1) == 0)
+ {
+ unsigned twos;
+ count_trailing_zeros (twos, dlow);
+ dlow >>= twos;
+ }
+ return mpn_modexact_1_odd (ap, asize, dlow) == 0;
}
- if (dn == 2)
+ if (dsize == 2)
{
mp_limb_t dsecond = dp[1];
if (dsecond <= dmask)
{
+ unsigned twos;
count_trailing_zeros (twos, dlow);
dlow = (dlow >> twos) | (dsecond << (GMP_NUMB_BITS-twos));
ASSERT_LIMB (dlow);
- return MPN_MOD_OR_MODEXACT_1_ODD (ap, an, dlow) == 0;
+ return MPN_MOD_OR_MODEXACT_1_ODD (ap, asize, dlow) == 0;
}
}
- /* Should we compute Q = A * D^(-1) mod B^k,
- R = A - Q * D mod B^k
- here, for some small values of k? Then check if R = 0 (mod B^k). */
-
- /* We could also compute A' = A mod T and D' = D mod P, for some
- P = 3 * 5 * 7 * 11 ..., and then check if any prime factor from P
- dividing D' also divides A'. */
-
TMP_MARK;
- rp = TMP_ALLOC_LIMBS (an + 1);
- qp = TMP_ALLOC_LIMBS (an - dn + 1); /* FIXME: Could we avoid this? */
-
- count_trailing_zeros (twos, dp[0]);
-
- if (twos != 0)
- {
- tp = TMP_ALLOC_LIMBS (dn);
- ASSERT_NOCARRY (mpn_rshift (tp, dp, dn, twos));
- dp = tp;
+ rp = TMP_ALLOC_LIMBS (asize+1);
+ qp = rp + dsize;
- ASSERT_NOCARRY (mpn_rshift (rp, ap, an, twos));
- }
- else
- {
- MPN_COPY (rp, ap, an);
- }
- if (rp[an - 1] >= dp[dn - 1])
- {
- rp[an] = 0;
- an++;
- }
- else if (an == dn)
- {
- TMP_FREE;
- return 0;
- }
-
- ASSERT (an > dn); /* requirement of functions below */
-
- if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) ||
- BELOW_THRESHOLD (an - dn, DC_BDIV_QR_THRESHOLD))
- {
- binvert_limb (di, dp[0]);
- mpn_sbpi1_bdiv_qr (qp, rp, an, dp, dn, -di);
- rp += an - dn;
- }
- else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD))
- {
- binvert_limb (di, dp[0]);
- mpn_dcpi1_bdiv_qr (qp, rp, an, dp, dn, -di);
- rp += an - dn;
- }
- else
- {
- tp = TMP_ALLOC_LIMBS (mpn_mu_bdiv_qr_itch (an, dn));
- mpn_mu_bdiv_qr (qp, rp, rp, an, dp, dn, tp);
- }
+ mpn_tdiv_qr (qp, rp, (mp_size_t) 0, ap, asize, dp, dsize);
- /* test for {rp,dn} zero or non-zero */
+ /* test for {rp,dsize} zero or non-zero */
i = 0;
do
{
@@ -194,7 +153,7 @@ mpn_divisible_p (mp_srcptr ap, mp_size_t an,
return 0;
}
}
- while (++i < dn);
+ while (++i < dsize);
TMP_FREE;
return 1;
diff --git a/gmp/mpn/generic/divrem.c b/gmp/mpn/generic/divrem.c
index f420992746..999ffdd347 100644
--- a/gmp/mpn/generic/divrem.c
+++ b/gmp/mpn/generic/divrem.c
@@ -1,33 +1,24 @@
/* mpn_divrem -- Divide natural numbers, producing both remainder and
- quotient. This is now just a middle layer calling mpn_tdiv_qr.
+ quotient. This is now just a middle layer for calling the new
+ internal mpn_tdiv_qr.
-Copyright 1993-1997, 1999-2002, 2005 Free Software Foundation, Inc.
+Copyright 1993, 1994, 1995, 1996, 1997, 1999, 2000, 2001, 2002, 2005 Free
+Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -56,7 +47,7 @@ mpn_divrem (mp_ptr qp, mp_size_t qxn,
TMP_DECL;
TMP_MARK;
- q2p = TMP_ALLOC_LIMBS (nn + qxn);
+ q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
qn = nn + qxn - 1;
@@ -81,11 +72,11 @@ mpn_divrem (mp_ptr qp, mp_size_t qxn,
if (UNLIKELY (qxn != 0))
{
mp_ptr n2p;
- n2p = TMP_ALLOC_LIMBS (nn + qxn);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
MPN_ZERO (n2p, qxn);
MPN_COPY (n2p + qxn, np, nn);
- q2p = TMP_ALLOC_LIMBS (nn - dn + qxn + 1);
- rp = TMP_ALLOC_LIMBS (dn);
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
MPN_COPY (np, rp, dn);
qn = nn - dn + qxn;
@@ -94,8 +85,8 @@ mpn_divrem (mp_ptr qp, mp_size_t qxn,
}
else
{
- q2p = TMP_ALLOC_LIMBS (nn - dn + 1);
- rp = TMP_ALLOC_LIMBS (dn);
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
MPN_COPY (np, rp, dn); /* overwrite np area with remainder */
qn = nn - dn;
diff --git a/gmp/mpn/generic/divrem_1.c b/gmp/mpn/generic/divrem_1.c
index 9157b5735e..c416946294 100644
--- a/gmp/mpn/generic/divrem_1.c
+++ b/gmp/mpn/generic/divrem_1.c
@@ -1,33 +1,22 @@
/* mpn_divrem_1 -- mpn by limb division.
-Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003 Free Software
+Copyright 1991, 1993, 1994, 1996, 1998, 1999, 2000, 2002, 2003 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -167,7 +156,7 @@ mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
else
{
/* Most significant bit of divisor == 0. */
- int cnt;
+ int norm;
/* Skip a division if high < divisor (high quotient 0). Testing here
before normalizing will still skip as often as possible. */
@@ -189,28 +178,28 @@ mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
&& BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD))
goto plain;
- count_leading_zeros (cnt, d);
- d <<= cnt;
- r <<= cnt;
+ count_leading_zeros (norm, d);
+ d <<= norm;
+ r <<= norm;
if (UDIV_NEEDS_NORMALIZATION
&& BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD))
{
- mp_limb_t nshift;
if (un != 0)
{
n1 = up[un - 1] << GMP_NAIL_BITS;
- r |= (n1 >> (GMP_LIMB_BITS - cnt));
+ r |= (n1 >> (GMP_LIMB_BITS - norm));
for (i = un - 2; i >= 0; i--)
{
n0 = up[i] << GMP_NAIL_BITS;
- nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
- udiv_qrnnd (*qp, r, r, nshift, d);
+ udiv_qrnnd (*qp, r, r,
+ (n1 << norm) | (n0 >> (GMP_NUMB_BITS - norm)),
+ d);
r >>= GMP_NAIL_BITS;
qp--;
n1 = n0;
}
- udiv_qrnnd (*qp, r, r, n1 << cnt, d);
+ udiv_qrnnd (*qp, r, r, n1 << norm, d);
r >>= GMP_NAIL_BITS;
qp--;
}
@@ -220,26 +209,27 @@ mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
r >>= GMP_NAIL_BITS;
qp--;
}
- return r >> cnt;
+ return r >> norm;
}
else
{
- mp_limb_t dinv, nshift;
+ mp_limb_t dinv;
invert_limb (dinv, d);
if (un != 0)
{
n1 = up[un - 1] << GMP_NAIL_BITS;
- r |= (n1 >> (GMP_LIMB_BITS - cnt));
+ r |= (n1 >> (GMP_LIMB_BITS - norm));
for (i = un - 2; i >= 0; i--)
{
n0 = up[i] << GMP_NAIL_BITS;
- nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
- udiv_qrnnd_preinv (*qp, r, r, nshift, d, dinv);
+ udiv_qrnnd_preinv (*qp, r, r,
+ ((n1 << norm) | (n0 >> (GMP_NUMB_BITS - norm))),
+ d, dinv);
r >>= GMP_NAIL_BITS;
qp--;
n1 = n0;
}
- udiv_qrnnd_preinv (*qp, r, r, n1 << cnt, d, dinv);
+ udiv_qrnnd_preinv (*qp, r, r, n1 << norm, d, dinv);
r >>= GMP_NAIL_BITS;
qp--;
}
@@ -249,7 +239,7 @@ mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
r >>= GMP_NAIL_BITS;
qp--;
}
- return r >> cnt;
+ return r >> norm;
}
}
}
diff --git a/gmp/mpn/generic/divrem_2.c b/gmp/mpn/generic/divrem_2.c
index 30d24bb102..ba761dc36c 100644
--- a/gmp/mpn/generic/divrem_2.c
+++ b/gmp/mpn/generic/divrem_2.c
@@ -1,119 +1,179 @@
/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and
quotient. The divisor is two limbs.
- THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+ THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE.
-Copyright 1993-1996, 1999-2002 Free Software Foundation, Inc.
+Copyright 1993, 1994, 1995, 1996, 1999, 2000, 2001, 2002 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-/* Divide num {np,nn} by den {dp,2} and write the nn-2 least significant
- quotient limbs at qp and the 2 long remainder at np. If qxn is non-zero,
- generate that many fraction bits and append them after the other quotient
- limbs. Return the most significant limb of the quotient, this is always 0
- or 1.
+/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd,
+ meaning the quotient size where that should happen, the quotient size
+ being how many udiv divisions will be done.
+
+ The default is to use preinv always, CPUs where this doesn't suit have
+ tuned thresholds. Note in particular that preinv should certainly be
+ used if that's the only division available (USE_PREINV_ALWAYS). */
+
+#ifndef DIVREM_2_THRESHOLD
+#define DIVREM_2_THRESHOLD 0
+#endif
+
+
+/* Divide num (NP/NSIZE) by den (DP/2) and write
+ the NSIZE-2 least significant quotient limbs at QP
+ and the 2 long remainder at NP. If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+ Return the most significant limb of the quotient, this is always 0 or 1.
Preconditions:
+ 0. NSIZE >= 2.
1. The most significant bit of the divisor must be set.
- 2. qp must either not overlap with the input operands at all, or
- qp >= np + 2 must hold true. (This means that it's possible to put
- the quotient in the high part of {np,nn}, right above the remainder.
- 3. nn >= 2, even if qxn is non-zero. */
+ 2. QP must either not overlap with the input operands at all, or
+ QP + 2 >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+ remainder in NUM.)
+ 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */
mp_limb_t
mpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
mp_ptr np, mp_size_t nn,
mp_srcptr dp)
{
- mp_limb_t most_significant_q_limb;
+ mp_limb_t most_significant_q_limb = 0;
mp_size_t i;
- mp_limb_t r1, r0, d1, d0;
- gmp_pi1_t di;
+ mp_limb_t n1, n0, n2;
+ mp_limb_t d1, d0;
+ mp_limb_t d1inv;
+ int use_preinv;
ASSERT (nn >= 2);
ASSERT (qxn >= 0);
ASSERT (dp[1] & GMP_NUMB_HIGHBIT);
- ASSERT (! MPN_OVERLAP_P (qp, nn-2+qxn, np, nn) || qp >= np+2);
+ ASSERT (! MPN_OVERLAP_P (qp, nn-2+qxn, np, nn) || qp+2 >= np);
ASSERT_MPN (np, nn);
ASSERT_MPN (dp, 2);
np += nn - 2; /* np now addresses the two most significant numerator limbs */
d1 = dp[1];
d0 = dp[0];
- r1 = np[1];
- r0 = np[0];
+ n1 = np[1];
+ n0 = np[0];
- most_significant_q_limb = 0;
- if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+ if (n1 >= d1 && (n1 > d1 || n0 >= d0))
{
#if GMP_NAIL_BITS == 0
- sub_ddmmss (r1, r0, r1, r0, d1, d0);
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
#else
- r0 = r0 - d0;
- r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
- r0 &= GMP_NUMB_MASK;
+ n0 = n0 - d0;
+ n1 = n1 - d1 - (n0 >> GMP_LIMB_BITS - 1);
+ n0 &= GMP_NUMB_MASK;
#endif
most_significant_q_limb = 1;
}
- invert_pi1 (di, d1, d0);
+ use_preinv = ABOVE_THRESHOLD (qxn + nn - 2, DIVREM_2_THRESHOLD); /* qxn + nn - 2 is the number of loop iterations below */
+ if (use_preinv)
+ invert_limb (d1inv, d1);
- qp += qxn;
-
- for (i = nn - 2 - 1; i >= 0; i--)
+ for (i = qxn + nn - 2 - 1; i >= 0; i--)
{
- mp_limb_t n0, q;
- n0 = np[-1];
- udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di.inv32);
- np--;
- qp[i] = q;
- }
+ mp_limb_t q;
+ mp_limb_t r;
- if (UNLIKELY (qxn != 0))
- {
- qp -= qxn;
- for (i = qxn - 1; i >= 0; i--)
+ if (i >= qxn)
+ np--;
+ else
+ np[0] = 0; /* fraction limb: the next numerator limb is zero */
+
+ if (n1 == d1)
+ {
+ /* Q should be either 111..111 or 111..110. Need special handling
+ of this rare case as normal division would give overflow. */
+ q = GMP_NUMB_MASK;
+
+ r = (n0 + d1) & GMP_NUMB_MASK;
+ if (r < d1) /* Carry in the addition? */
+ {
+#if GMP_NAIL_BITS == 0
+ add_ssaaaa (n1, n0, r - d0, np[0], 0, d0);
+#else
+ n0 = np[0] + d0;
+ n1 = (r - d0 + (n0 >> GMP_NUMB_BITS)) & GMP_NUMB_MASK;
+ n0 &= GMP_NUMB_MASK;
+#endif
+ qp[i] = q;
+ continue;
+ }
+ n1 = d0 - (d0 != 0);
+ n0 = -d0 & GMP_NUMB_MASK;
+ }
+ else
{
- mp_limb_t q;
- udiv_qr_3by2 (q, r1, r0, r1, r0, CNST_LIMB(0), d1, d0, di.inv32);
- qp[i] = q;
+ if (use_preinv)
+ udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv);
+ else
+ udiv_qrnnd (q, r, n1, n0 << GMP_NAIL_BITS, d1 << GMP_NAIL_BITS);
+ r >>= GMP_NAIL_BITS;
+ umul_ppmm (n1, n0, d0, q << GMP_NAIL_BITS);
+ n0 >>= GMP_NAIL_BITS;
}
- }
- np[1] = r1;
- np[0] = r0;
+ n2 = np[0]; /* limb brought down for this quotient step */
+
+ q_test:
+ if (n1 > r || (n1 == r && n0 > n2))
+ {
+ /* The estimated Q was too large. */
+ q--;
+
+#if GMP_NAIL_BITS == 0
+ sub_ddmmss (n1, n0, n1, n0, 0, d0);
+#else
+ n0 = n0 - d0;
+ n1 = n1 - (n0 >> GMP_LIMB_BITS - 1);
+ n0 &= GMP_NUMB_MASK;
+#endif
+ r += d1;
+ if (r >= d1) /* If not carry, test Q again. */
+ goto q_test;
+ }
+
+ qp[i] = q;
+#if GMP_NAIL_BITS == 0
+ sub_ddmmss (n1, n0, r, n2, n1, n0);
+#else
+ n0 = n2 - n0;
+ n1 = r - n1 - (n0 >> GMP_LIMB_BITS - 1);
+ n0 &= GMP_NUMB_MASK;
+#endif
+ }
+ np[1] = n1;
+ np[0] = n0;
return most_significant_q_limb;
}
diff --git a/gmp/mpn/generic/dump.c b/gmp/mpn/generic/dump.c
index 3a73fe49e3..38309996cc 100644
--- a/gmp/mpn/generic/dump.c
+++ b/gmp/mpn/generic/dump.c
@@ -3,33 +3,22 @@
FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 1996, 2000-2002, 2005 Free Software Foundation, Inc.
+Copyright 1996, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <stdio.h>
#include "gmp.h"
diff --git a/gmp/mpn/generic/fib2_ui.c b/gmp/mpn/generic/fib2_ui.c
index eb6e56e736..a39d538262 100644
--- a/gmp/mpn/generic/fib2_ui.c
+++ b/gmp/mpn/generic/fib2_ui.c
@@ -4,37 +4,28 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2001, 2002, 2005, 2009 Free Software Foundation, Inc.
+Copyright 2001, 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <stdio.h>
#include "gmp.h"
#include "gmp-impl.h"
+#include "longlong.h"
+
/* change this to "#define TRACE(x) x" for diagnostics */
#define TRACE(x)
@@ -61,13 +52,20 @@ see https://www.gnu.org/licenses/. */
This property of F[4m+3] can be verified by induction on F[4m+3] =
7*F[4m-1] - F[4m-5], that formula being a standard lucas sequence
identity U[i+j] = U[i]*V[j] - U[i-j]*Q^j.
-*/
+
+ Enhancements:
+
+ If there was an mpn_addlshift, it'd be possible to eliminate the yp
+ temporary, using xp=F[k]^2, fp=F[k-1]^2, f1p=xp+fp, fp+=4*fp, fp-=f1p,
+ fp+=2*(-1)^n, etc. */
mp_size_t
mpn_fib2_ui (mp_ptr fp, mp_ptr f1p, unsigned long int n)
{
+ mp_ptr xp, yp;
mp_size_t size;
unsigned long nfirst, mask;
+ TMP_DECL;
TRACE (printf ("mpn_fib2_ui n=%lu\n", n));
@@ -87,15 +85,15 @@ mpn_fib2_ui (mp_ptr fp, mp_ptr f1p, unsigned long int n)
if (mask != 1)
{
mp_size_t alloc;
- mp_ptr xp;
- TMP_DECL;
TMP_MARK;
alloc = MPN_FIB2_SIZE (n);
- xp = TMP_ALLOC_LIMBS (alloc);
+ TMP_ALLOC_LIMBS_2 (xp,alloc, yp,alloc);
do
{
+ mp_limb_t c;
+
/* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from
n&mask upwards.
@@ -116,65 +114,45 @@ mpn_fib2_ui (mp_ptr fp, mp_ptr f1p, unsigned long int n)
/* f1p[size-1] might be zero, but this occurs rarely, so it's not
worth bothering checking for it */
ASSERT (alloc >= 2*size);
- mpn_sqr (xp, fp, size);
- mpn_sqr (fp, f1p, size);
+ mpn_sqr_n (xp, fp, size);
+ mpn_sqr_n (yp, f1p, size);
size *= 2;
/* Shrink if possible. Since fp was normalized there'll be at
most one high zero on xp (and if there is then there's one on
yp too). */
- ASSERT (xp[size-1] != 0 || fp[size-1] == 0);
+ ASSERT (xp[size-1] != 0 || yp[size-1] == 0);
size -= (xp[size-1] == 0);
ASSERT (xp[size-1] != 0); /* only one xp high zero */
- /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */
- f1p[size] = mpn_add_n (f1p, xp, fp, size);
-
/* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k.
n&mask is the low bit of our implied k. */
-#if HAVE_NATIVE_mpn_rsblsh2_n || HAVE_NATIVE_mpn_rsblsh_n
-#if HAVE_NATIVE_mpn_rsblsh2_n
- fp[size] = mpn_rsblsh2_n (fp, fp, xp, size);
-#else /* HAVE_NATIVE_mpn_rsblsh_n */
- fp[size] = mpn_rsblsh_n (fp, fp, xp, size, 2);
-#endif
- if ((n & mask) == 0)
- MPN_INCR_U(fp, size + 1, 2); /* possible +2 */
- else
- {
- ASSERT (fp[0] >= 2);
- fp[0] -= 2; /* possible -2 */
- }
-#else
- {
- mp_limb_t c;
-
- c = mpn_lshift (xp, xp, size, 2);
- xp[0] |= (n & mask ? 0 : 2); /* possible +2 */
- c -= mpn_sub_n (fp, xp, fp, size);
- ASSERT (n & mask ? fp[0] != 0 && fp[0] != 1 : 1);
- fp[0] -= (n & mask ? 2 : 0); /* possible -2 */
- fp[size] = c;
- }
-#endif
+ c = mpn_lshift (fp, xp, size, 2);
+ fp[0] |= (n & mask ? 0 : 2); /* possible +2 */
+ c -= mpn_sub_n (fp, fp, yp, size);
+ ASSERT (n & (mask << 1) ? fp[0] != 0 && fp[0] != 1 : 1);
+ fp[0] -= (n & mask ? 2 : 0); /* possible -2 */
ASSERT (alloc >= size+1);
- size += (fp[size] != 0);
+ xp[size] = 0;
+ yp[size] = 0;
+ fp[size] = c;
+ size += (c != 0);
+
+ /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2.
+ F[2k-1]<F[2k+1] so no carry out of "size" limbs. */
+ ASSERT_NOCARRY (mpn_add_n (f1p, xp, yp, size));
/* now n&mask is the new bit of n being considered */
mask >>= 1;
/* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of
F[2k+1] and F[2k-1]. */
- if (n & mask)
- ASSERT_NOCARRY (mpn_sub_n (f1p, fp, f1p, size));
- else {
- ASSERT_NOCARRY (mpn_sub_n ( fp, fp, f1p, size));
-
- /* Can have a high zero after replacing F[2k+1] with F[2k].
- f1p will have a high zero if fp does. */
- ASSERT (fp[size-1] != 0 || f1p[size-1] == 0);
- size -= (fp[size-1] == 0);
- }
+ ASSERT_NOCARRY (mpn_sub_n ((n & mask ? f1p : fp), fp, f1p, size));
+
+ /* Can have a high zero after replacing F[2k+1] with F[2k].
+ f1p will have a high zero if fp does. */
+ ASSERT (fp[size-1] != 0 || f1p[size-1] == 0);
+ size -= (fp[size-1] == 0);
}
while (mask != 1);
diff --git a/gmp/mpn/generic/gcd.c b/gmp/mpn/generic/gcd.c
index b14e1ad888..542e0fe7b8 100644
--- a/gmp/mpn/generic/gcd.c
+++ b/gmp/mpn/generic/gcd.c
@@ -1,33 +1,22 @@
/* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
-Copyright 1991, 1993-1998, 2000-2005, 2008, 2010, 2012 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001, 2002, 2003,
+2004, 2005, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -35,7 +24,7 @@ see https://www.gnu.org/licenses/. */
/* Uses the HGCD operation described in
- N. Möller, On Schönhage's algorithm and subquadratic integer gcd
+ N. Möller, On Schönhage's algorithm and subquadratic integer gcd
computation, Math. Comp. 77 (2008), 589-607.
to reduce inputs until they are of size below GCD_DC_THRESHOLD, and
@@ -62,76 +51,6 @@ mp_size_t p_table[P_TABLE_SIZE];
#define CHOOSE_P(n) (2*(n) / 3)
#endif
-struct gcd_ctx
-{
- mp_ptr gp;
- mp_size_t gn;
-};
-
-static void
-gcd_hook (void *p, mp_srcptr gp, mp_size_t gn,
- mp_srcptr qp, mp_size_t qn, int d)
-{
- struct gcd_ctx *ctx = (struct gcd_ctx *) p;
- MPN_COPY (ctx->gp, gp, gn);
- ctx->gn = gn;
-}
-
-#if GMP_NAIL_BITS > 0
-/* Nail supports should be easy, replacing the sub_ddmmss with nails
- * logic. */
-#error Nails not supported.
-#endif
-
-/* Use binary algorithm to compute G <-- GCD (U, V) for usize, vsize == 2.
- Both U and V must be odd. */
-static inline mp_size_t
-gcd_2 (mp_ptr gp, mp_srcptr up, mp_srcptr vp)
-{
- mp_limb_t u0, u1, v0, v1;
- mp_size_t gn;
-
- u0 = up[0];
- u1 = up[1];
- v0 = vp[0];
- v1 = vp[1];
-
- ASSERT (u0 & 1);
- ASSERT (v0 & 1);
-
- /* Check for u0 != v0 needed to ensure that argument to
- * count_trailing_zeros is non-zero. */
- while (u1 != v1 && u0 != v0)
- {
- unsigned long int r;
- if (u1 > v1)
- {
- sub_ddmmss (u1, u0, u1, u0, v1, v0);
- count_trailing_zeros (r, u0);
- u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
- u1 >>= r;
- }
- else /* u1 < v1. */
- {
- sub_ddmmss (v1, v0, v1, v0, u1, u0);
- count_trailing_zeros (r, v0);
- v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
- v1 >>= r;
- }
- }
-
- gp[0] = u0, gp[1] = u1, gn = 1 + (u1 != 0);
-
- /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
- if (u1 == v1 && u0 == v0)
- return gn;
-
- v0 = (u0 == v0) ? ((u1 > v1) ? u1-v1 : v1-u1) : ((u0 > v0) ? u0-v0 : v0-u0);
- gp[0] = mpn_gcd_1 (gp, gn, v0);
-
- return 1;
-}
-
mp_size_t
mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n)
{
@@ -139,17 +58,13 @@ mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n)
mp_size_t scratch;
mp_size_t matrix_scratch;
- struct gcd_ctx ctx;
+ mp_size_t gn;
mp_ptr tp;
TMP_DECL;
- ASSERT (usize >= n);
- ASSERT (n > 0);
- ASSERT (vp[n-1] > 0);
-
/* FIXME: Check for small sizes first, before setting up temporary
storage etc. */
- talloc = MPN_GCD_SUBDIV_STEP_ITCH(n);
+ talloc = MPN_GCD_LEHMER_N_ITCH(n);
/* For initial division */
scratch = usize - n + 1;
@@ -192,13 +107,11 @@ mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n)
if (mpn_zero_p (up, n))
{
MPN_COPY (gp, vp, n);
- ctx.gn = n;
- goto done;
+ TMP_FREE;
+ return n;
}
}
- ctx.gp = gp;
-
#if TUNE_GCD_P
while (CHOOSE_P (n) > 0)
#else
@@ -221,90 +134,153 @@ mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n)
else
{
/* Temporary storage n */
- n = mpn_gcd_subdiv_step (up, vp, n, 0, gcd_hook, &ctx, tp);
+ n = mpn_gcd_subdiv_step (gp, &gn, up, vp, n, tp);
if (n == 0)
- goto done;
+ {
+ TMP_FREE;
+ return gn;
+ }
}
}
- while (n > 2)
- {
- struct hgcd_matrix1 M;
- mp_limb_t uh, ul, vh, vl;
- mp_limb_t mask;
+ gn = mpn_gcd_lehmer_n (gp, up, vp, n, tp);
+ TMP_FREE;
+ return gn;
+}
- mask = up[n-1] | vp[n-1];
- ASSERT (mask > 0);
+#ifdef TUNE_GCD_P
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include "speed.h"
- if (mask & GMP_NUMB_HIGHBIT)
- {
- uh = up[n-1]; ul = up[n-2];
- vh = vp[n-1]; vl = vp[n-2];
- }
- else
- {
- int shift;
+static int
+compare_double(const void *ap, const void *bp)
+{ /* Three-way comparison of the doubles at ap and bp: -1, 1 or 0.
+ Passed to qsort by median(). */
+ double a = * (const double *) ap;
+ double b = * (const double *) bp;
+
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ else
+ return 0;
+}
- count_leading_zeros (shift, mask);
- uh = MPN_EXTRACT_NUMB (shift, up[n-1], up[n-2]);
- ul = MPN_EXTRACT_NUMB (shift, up[n-2], up[n-3]);
- vh = MPN_EXTRACT_NUMB (shift, vp[n-1], vp[n-2]);
- vl = MPN_EXTRACT_NUMB (shift, vp[n-2], vp[n-3]);
- }
+static double
+median (double *v, size_t n)
+{
+ qsort(v, n, sizeof(*v), compare_double); /* sorts the caller's array in place */
- /* Try an mpn_hgcd2 step */
- if (mpn_hgcd2 (uh, ul, vh, vl, &M))
- {
- n = mpn_matrix22_mul1_inverse_vector (&M, tp, up, vp, n);
- MP_PTR_SWAP (up, tp);
- }
- else
- {
- /* mpn_hgcd2 has failed. Then either one of a or b is very
- small, or the difference is very small. Perform one
- subtraction followed by one division. */
+ return v[n/2]; /* middle element; the upper of the two when n is even */
+}
- /* Temporary storage n */
- n = mpn_gcd_subdiv_step (up, vp, n, 0, &gcd_hook, &ctx, tp);
- if (n == 0)
- goto done;
- }
- }
+#define TIME(res, code) do { /* res = median time of 5 runs of code */ \
+ double time_measurement[5]; \
+ unsigned time_i; \
+ \
+ for (time_i = 0; time_i < 5; time_i++) \
+ { \
+ speed_starttime(); \
+ code; \
+ time_measurement[time_i] = speed_endtime(); \
+ } \
+ res = median(time_measurement, 5); \
+} while (0)
+
+/* Tuning driver, compiled only under TUNE_GCD_P. For every size n in
+ [100, P_TABLE_SIZE) it times mpn_gcd_lehmer_n, then mpn_gcd with each
+ candidate p_table[n] = p for 0.48 n <= p < 0.77 n, prints the fastest p
+ and records it in p_table[n]. 0 is recorded when no candidate beats
+ mpn_gcd_lehmer_n or the gain is below 1%. Returns 0. */
+int
+main(int argc, char **argv)
+{
+ gmp_randstate_t rands;
+ mp_size_t n;
+ mp_ptr ap;
+ mp_ptr bp;
+ mp_ptr up;
+ mp_ptr vp;
+ mp_ptr gp;
+ mp_ptr tp;
+ TMP_DECL;
- ASSERT(up[n-1] | vp[n-1]);
+ /* Unbuffered so if output is redirected to a file it isn't lost if the
+ program is killed part way through. */
+ setbuf (stdout, NULL);
+ setbuf (stderr, NULL);
- if (n == 1)
- {
- *gp = mpn_gcd_1(up, 1, vp[0]);
- ctx.gn = 1;
- goto done;
- }
+ gmp_randinit_default (rands);
- /* Due to the calling convention for mpn_gcd, at most one can be
- even. */
+ TMP_MARK;
- if (! (up[0] & 1))
- MP_PTR_SWAP (up, vp);
+ ap = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ bp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ up = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ vp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ gp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ tp = TMP_ALLOC_LIMBS (MPN_GCD_LEHMER_N_ITCH (P_TABLE_SIZE));
- ASSERT (up[0] & 1);
+ mpn_random (ap, P_TABLE_SIZE);
+ mpn_random (bp, P_TABLE_SIZE);
- if (vp[0] == 0)
- {
- *gp = mpn_gcd_1 (up, 2, vp[1]);
- ctx.gn = 1;
- goto done;
- }
- else if (! (vp[0] & 1))
+ memset (p_table, 0, sizeof(p_table));
+
+ /* The bound is the loop condition and n++ the step; with the two swapped
+ the loop never stopped at P_TABLE_SIZE and ran off the ends of ap, bp
+ and p_table, all of which hold P_TABLE_SIZE entries. */
+ for (n = 100; n < P_TABLE_SIZE; n++)
{
- int r;
- count_trailing_zeros (r, vp[0]);
- vp[0] = ((vp[1] << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (vp[0] >> r);
- vp[1] >>= r;
- }
+ mp_size_t p;
+ mp_size_t best_p;
+ double best_time;
+ double lehmer_time;
+
+ /* Keep the operands normalized: the top limb must be non-zero. */
+ if (ap[n-1] == 0)
+ ap[n-1] = 1;
+
+ if (bp[n-1] == 0)
+ bp[n-1] = 1;
+
+ p_table[n] = 0;
+ TIME(lehmer_time, {
+ MPN_COPY (up, ap, n);
+ MPN_COPY (vp, bp, n);
+ mpn_gcd_lehmer_n (gp, up, vp, n, tp);
+ });
- ctx.gn = gcd_2(gp, up, vp);
+ best_time = lehmer_time;
+ best_p = 0;
-done:
+ for (p = n * 0.48; p < n * 0.77; p++)
+ {
+ double t;
+
+ p_table[n] = p;
+
+ TIME(t, {
+ MPN_COPY (up, ap, n);
+ MPN_COPY (vp, bp, n);
+ mpn_gcd (gp, up, n, vp, n);
+ });
+
+ if (t < best_time)
+ {
+ best_time = t;
+ best_p = p;
+ }
+ }
+ /* mp_size_t need not be int, so cast to match the conversions. */
+ printf("%6ld %6ld %5.3g", (long) n, (long) best_p, (double) best_p / n);
+ if (best_p > 0)
+ {
+ double speedup = 100 * (lehmer_time - best_time) / lehmer_time;
+ printf(" %5.3g%%", speedup);
+ if (speedup < 1.0)
+ {
+ printf(" (ignored)");
+ best_p = 0;
+ }
+ }
+ printf("\n");
+
+ p_table[n] = best_p;
+ }
TMP_FREE;
- return ctx.gn;
+ gmp_randclear(rands);
+ return 0;
}
+#endif /* TUNE_GCD_P */
diff --git a/gmp/mpn/generic/gcd_1.c b/gmp/mpn/generic/gcd_1.c
index f6dcb4a2eb..73be15134c 100644
--- a/gmp/mpn/generic/gcd_1.c
+++ b/gmp/mpn/generic/gcd_1.c
@@ -1,54 +1,26 @@
/* mpn_gcd_1 -- mpn and limb greatest common divisor.
-Copyright 1994, 1996, 2000, 2001, 2009, 2012 Free Software Foundation, Inc.
+Copyright 1994, 1996, 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-#ifndef GCD_1_METHOD
-#define GCD_1_METHOD 2
-#endif
-
-#define USE_ZEROTAB 0
-
-#if USE_ZEROTAB
-#define MAXSHIFT 4
-#define MASK ((1 << MAXSHIFT) - 1)
-static const unsigned char zerotab[1 << MAXSHIFT] =
-{
-#if MAXSHIFT > 4
- 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-#endif
- 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
-};
-#endif
/* Does not work for U == 0 or V == 0. It would be tough to make it work for
V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t.
@@ -109,10 +81,6 @@ mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
goto strip_u_maybe;
}
- ASSERT (ulimb & 1);
- ASSERT (vlimb & 1);
-
-#if GCD_1_METHOD == 1
while (ulimb != vlimb)
{
ASSERT (ulimb & 1);
@@ -141,58 +109,6 @@ mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
while ((vlimb & 1) == 0);
}
}
-#else
-# if GCD_1_METHOD == 2
-
- ulimb >>= 1;
- vlimb >>= 1;
-
- while (ulimb != vlimb)
- {
- int c;
- mp_limb_t t;
- mp_limb_t vgtu;
-
- t = ulimb - vlimb;
- vgtu = LIMB_HIGHBIT_TO_MASK (t);
-
- /* v <-- min (u, v) */
- vlimb += (vgtu & t);
-
- /* u <-- |u - v| */
- ulimb = (t ^ vgtu) - vgtu;
-
-#if USE_ZEROTAB
- /* Number of trailing zeros is the same no matter if we look at
- * t or ulimb, but using t gives more parallelism. */
- c = zerotab[t & MASK];
-
- while (UNLIKELY (c == MAXSHIFT))
- {
- ulimb >>= MAXSHIFT;
- if (0)
- strip_u_maybe:
- vlimb >>= 1;
-
- c = zerotab[ulimb & MASK];
- }
-#else
- if (0)
- {
- strip_u_maybe:
- vlimb >>= 1;
- t = ulimb;
- }
- count_trailing_zeros (c, t);
-#endif
- ulimb >>= (c + 1);
- }
-
- vlimb = (vlimb << 1) | 1;
-# else
-# error Unknown GCD_1_METHOD
-# endif
-#endif
done:
return vlimb << zero_bits;
diff --git a/gmp/mpn/generic/gcd_lehmer.c b/gmp/mpn/generic/gcd_lehmer.c
new file mode 100644
index 0000000000..37fd3c590d
--- /dev/null
+++ b/gmp/mpn/generic/gcd_lehmer.c
@@ -0,0 +1,160 @@
+/* gcd_lehmer.c.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Use binary algorithm to compute G <-- GCD (U, V) for usize, vsize == 2.
+ Both U and V must be odd.
+
+ U is read from up[0..1] and V from vp[0..1]. The result is stored at gp
+ and its size in limbs, 1 or 2, is returned. */
+static inline mp_size_t
+gcd_2 (mp_ptr gp, mp_srcptr up, mp_srcptr vp)
+{
+ mp_limb_t u0, u1, v0, v1;
+ mp_size_t gn;
+
+ u0 = up[0];
+ u1 = up[1];
+ v0 = vp[0];
+ v1 = vp[1];
+
+ ASSERT (u0 & 1);
+ ASSERT (v0 & 1);
+
+ /* Check for u0 != v0 needed to ensure that argument to
+ * count_trailing_zeros is non-zero. */
+ while (u1 != v1 && u0 != v0)
+ {
+ unsigned long int r;
+ if (u1 > v1)
+ {
+ u1 -= v1 + (u0 < v0); /* high limb, less the borrow out of the low limb */
+ u0 = (u0 - v0) & GMP_NUMB_MASK;
+ count_trailing_zeros (r, u0); /* odd - odd is even, so r >= 1 */
+ u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
+ u1 >>= r;
+ }
+ else /* u1 < v1. */
+ {
+ v1 -= u1 + (v0 < u0); /* high limb, less the borrow out of the low limb */
+ v0 = (v0 - u0) & GMP_NUMB_MASK;
+ count_trailing_zeros (r, v0); /* odd - odd is even, so r >= 1 */
+ v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
+ v1 >>= r;
+ }
+ }
+
+ gp[0] = u0, gp[1] = u1, gn = 1 + (u1 != 0);
+
+ /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
+ if (u1 == v1 && u0 == v0)
+ return gn;
+
+ v0 = (u0 == v0) ? ((u1 > v1) ? u1-v1 : v1-u1) : ((u0 > v0) ? u0-v0 : v0-u0); /* exactly one limb pair is equal here; use the non-zero
+ difference of the other pair as the single-limb operand */
+ gp[0] = mpn_gcd_1 (gp, gn, v0);
+
+ return 1;
+}
+
+/* Temporary storage: n
+
+ Computes the gcd of {ap,n} and {bp,n}, stores it at gp and returns its
+ size in limbs. While n > 2 the operands are reduced with mpn_hgcd2 on
+ their leading limbs, falling back to mpn_gcd_subdiv_step when that
+ fails; the last one or two limbs go to mpn_gcd_1 or gcd_2. The operand
+ areas and tp are used as working storage. */
+mp_size_t
+mpn_gcd_lehmer_n (mp_ptr gp, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
+{
+ /* Relax this requirement, and normalize at the start? Must disallow
+ A = B = 0, though. */
+ ASSERT(ap[n-1] > 0 || bp[n-1] > 0);
+
+ while (n > 2)
+ {
+ struct hgcd_matrix1 M;
+ mp_limb_t ah, al, bh, bl;
+ mp_limb_t mask;
+
+ mask = ap[n-1] | bp[n-1];
+ ASSERT (mask > 0);
+
+ if (mask & GMP_NUMB_HIGHBIT)
+ {
+ ah = ap[n-1]; al = ap[n-2];
+ bh = bp[n-1]; bl = bp[n-2];
+ }
+ else
+ {
+ int shift;
+
+ count_leading_zeros (shift, mask); /* same shift for both operands; reads limb n-3, which exists as n > 2 */
+ ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+ al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+ bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+ bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+ }
+
+ /* Try an mpn_hgcd2 step */
+ if (mpn_hgcd2 (ah, al, bh, bl, &M))
+ {
+ n = mpn_hgcd_mul_matrix1_inverse_vector (&M, tp, ap, bp, n);
+ MP_PTR_SWAP (ap, tp);
+ }
+ else
+ {
+ /* mpn_hgcd2 has failed. Then either one of a or b is very
+ small, or the difference is very small. Perform one
+ subtraction followed by one division. */
+ mp_size_t gn;
+
+ /* Temporary storage n */
+ n = mpn_gcd_subdiv_step (gp, &gn, ap, bp, n, tp);
+ if (n == 0)
+ return gn;
+ }
+ }
+
+ if (n == 1)
+ {
+ *gp = mpn_gcd_1(ap, 1, bp[0]);
+ return 1;
+ }
+
+ /* Due to the calling convention for mpn_gcd, at most one can be
+ even. */
+
+ if (! (ap[0] & 1))
+ MP_PTR_SWAP (ap, bp);
+
+ ASSERT (ap[0] & 1);
+
+ if (bp[0] == 0)
+ {
+ *gp = mpn_gcd_1 (ap, 2, bp[1]);
+ return 1;
+ }
+ else if (! (bp[0] & 1))
+ {
+ int r;
+ count_trailing_zeros (r, bp[0]); /* make B odd as gcd_2 requires; A is already odd */
+ bp[0] = ((bp[1] << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (bp[0] >> r);
+ bp[1] >>= r;
+ }
+
+ return gcd_2(gp, ap, bp);
+}
diff --git a/gmp/mpn/generic/gcd_subdiv_step.c b/gmp/mpn/generic/gcd_subdiv_step.c
index 18634bec9f..47c0c26c86 100644
--- a/gmp/mpn/generic/gcd_subdiv_step.c
+++ b/gmp/mpn/generic/gcd_subdiv_step.c
@@ -4,35 +4,22 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2003-2005, 2008, 2010, 2011 Free Software Foundation, Inc.
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-#include <stdlib.h> /* for NULL */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -40,47 +27,17 @@ see https://www.gnu.org/licenses/. */
/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or
b is small, or the difference is small. Perform one subtraction
- followed by one division. The normal case is to compute the reduced
- a and b, and return the new size.
-
- If s == 0 (used for gcd and gcdext), returns zero if the gcd is
- found.
-
- If s > 0, don't reduce to size <= s, and return zero if no
- reduction is possible (if either a, b or |a-b| is of size <= s). */
-
-/* The hook function is called as
-
- hook(ctx, gp, gn, qp, qn, d)
-
- in the following cases:
-
- + If A = B at the start, G is the gcd, Q is NULL, d = -1.
-
- + If one input is zero at the start, G is the gcd, Q is NULL,
- d = 0 if A = G and d = 1 if B = G.
-
- Otherwise, if d = 0 we have just subtracted a multiple of A from B,
- and if d = 1 we have subtracted a multiple of B from A.
-
- + If A = B after subtraction, G is the gcd, Q is NULL.
-
- + If we get a zero remainder after division, G is the gcd, Q is the
- quotient.
-
- + Otherwise, G is NULL, Q is the quotient (often 1).
-
- */
+ followed by one division. If the gcd is found, stores it in gp and
+ *gn, and returns zero. Otherwise, compute the reduced a and b, and
+ return the new size. */
+/* FIXME: Check when the smaller number is a single limb, and invoke
+ * mpn_gcd_1. */
mp_size_t
-mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s,
- gcd_subdiv_step_hook *hook, void *ctx,
- mp_ptr tp)
+mpn_gcd_subdiv_step (mp_ptr gp, mp_size_t *gn,
+ mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
- static const mp_limb_t one = CNST_LIMB(1);
- mp_size_t an, bn, qn;
-
- int swapped;
+ mp_size_t an, bn;
ASSERT (n > 0);
ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
@@ -89,117 +46,59 @@ mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s,
MPN_NORMALIZE (ap, an);
MPN_NORMALIZE (bp, bn);
- swapped = 0;
-
- /* Arrange so that a < b, subtract b -= a, and maintain
- normalization. */
- if (an == bn)
+ if (UNLIKELY (an == 0))
{
- int c;
- MPN_CMP (c, ap, bp, an);
- if (UNLIKELY (c == 0))
- {
- /* For gcdext, return the smallest of the two cofactors, so
- pass d = -1. */
- if (s == 0)
- hook (ctx, ap, an, NULL, 0, -1);
- return 0;
- }
- else if (c > 0)
- {
- MP_PTR_SWAP (ap, bp);
- swapped ^= 1;
- }
- }
- else
- {
- if (an > bn)
- {
- MPN_PTR_SWAP (ap, an, bp, bn);
- swapped ^= 1;
- }
- }
- if (an <= s)
- {
- if (s == 0)
- hook (ctx, bp, bn, NULL, 0, swapped ^ 1);
+ return_b:
+ MPN_COPY (gp, bp, bn);
+ *gn = bn;
return 0;
}
-
- ASSERT_NOCARRY (mpn_sub (bp, bp, bn, ap, an));
- MPN_NORMALIZE (bp, bn);
- ASSERT (bn > 0);
-
- if (bn <= s)
+ else if (UNLIKELY (bn == 0))
{
- /* Undo subtraction. */
- mp_limb_t cy = mpn_add (bp, ap, an, bp, bn);
- if (cy > 0)
- bp[an] = cy;
+ return_a:
+ MPN_COPY (gp, ap, an);
+ *gn = an;
return 0;
}
- /* Arrange so that a < b */
- if (an == bn)
+ /* Arrange so that a > b, subtract a -= b, and maintain
+ normalization. */
+ if (an < bn)
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ else if (an == bn)
{
int c;
MPN_CMP (c, ap, bp, an);
if (UNLIKELY (c == 0))
- {
- if (s > 0)
- /* Just record subtraction and return */
- hook (ctx, NULL, 0, &one, 1, swapped);
- else
- /* Found gcd. */
- hook (ctx, bp, bn, NULL, 0, swapped);
- return 0;
- }
-
- hook (ctx, NULL, 0, &one, 1, swapped);
-
- if (c > 0)
- {
- MP_PTR_SWAP (ap, bp);
- swapped ^= 1;
- }
+ goto return_a;
+ else if (c < 0)
+ MP_PTR_SWAP (ap, bp);
}
- else
- {
- hook (ctx, NULL, 0, &one, 1, swapped);
- if (an > bn)
- {
- MPN_PTR_SWAP (ap, an, bp, bn);
- swapped ^= 1;
- }
+ ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn));
+ MPN_NORMALIZE (ap, an);
+ ASSERT (an > 0);
+
+ /* Arrange so that a > b, and divide a = q b + r */
+ /* FIXME: an < bn happens when we have cancellation. If that is the
+ common case, then we could reverse the roles of a and b to avoid
+ the swap. */
+ if (an < bn)
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (UNLIKELY (c == 0))
+ goto return_a;
+ else if (c < 0)
+ MP_PTR_SWAP (ap, bp);
}
- mpn_tdiv_qr (tp, bp, 0, bp, bn, ap, an);
- qn = bn - an + 1;
- bn = an;
- MPN_NORMALIZE (bp, bn);
+ mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
- if (UNLIKELY (bn <= s))
- {
- if (s == 0)
- {
- hook (ctx, ap, an, tp, qn, swapped);
- return 0;
- }
-
- /* Quotient is one too large, so decrement it and add back A. */
- if (bn > 0)
- {
- mp_limb_t cy = mpn_add (bp, ap, an, bp, bn);
- if (cy)
- bp[an++] = cy;
- }
- else
- MPN_COPY (bp, ap, an);
-
- MPN_DECR_U (tp, qn, 1);
- }
+ if (mpn_zero_p (ap, bn))
+ goto return_b;
- hook (ctx, NULL, 0, tp, qn, swapped);
- return an;
+ return bn;
}
diff --git a/gmp/mpn/generic/gcdext.c b/gmp/mpn/generic/gcdext.c
index 1c4ff75aab..38487ae66d 100644
--- a/gmp/mpn/generic/gcdext.c
+++ b/gmp/mpn/generic/gcdext.c
@@ -1,33 +1,22 @@
/* mpn_gcdext -- Extended Greatest Common Divisor.
-Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
-Inc.
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -96,10 +85,10 @@ hgcd_mul_matrix_vector (struct hgcd_matrix *M,
return n;
}
-#define COMPUTE_V_ITCH(n) (2*(n))
+#define COMPUTE_V_ITCH(n) (2*(n) + 1)
/* Computes |v| = |(g - u a)| / b, where u may be positive or
- negative, and v is of the opposite sign. max(a, b) is of size n, u and
+ negative, and v is of the opposite sign. a, b are of size n, u and
v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
@@ -119,11 +108,9 @@ compute_v (mp_ptr vp,
size = ABS (usize);
ASSERT (size <= n);
- ASSERT (up[size-1] > 0);
an = n;
MPN_NORMALIZE (ap, an);
- ASSERT (gn <= an);
if (an >= size)
mpn_mul (tp, ap, an, up, size);
@@ -132,6 +119,8 @@ compute_v (mp_ptr vp,
size += an;
+ ASSERT (gn <= size);
+
if (usize > 0)
{
/* |v| = -v = (u a - g) / b */
@@ -142,11 +131,11 @@ compute_v (mp_ptr vp,
return 0;
}
else
- { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
- (g + |u| a) always fits in (|usize| + an) limbs. */
-
- ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
- size -= (tp[size - 1] == 0);
+ { /* usize < 0 */
+ /* |v| = v = (g - u a) / b = (g + |u| a) / b */
+ mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
+ if (cy)
+ tp[size++] = cy;
}
/* Now divide t / b. There must be no remainder */
@@ -157,9 +146,21 @@ compute_v (mp_ptr vp,
vn = size + 1 - bn;
ASSERT (vn <= n + 1);
- mpn_divexact (vp, tp, size, bp, bn);
+ /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
+ GMP_NUMB_BITS}. */
+ mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
vn -= (vp[vn-1] == 0);
+ /* Remainder must be zero */
+#if WANT_ASSERT
+ {
+ mp_size_t i;
+ for (i = 0; i < bn; i++)
+ {
+ ASSERT (tp[i] == 0);
+ }
+ }
+#endif
return vn;
}
@@ -180,8 +181,7 @@ compute_v (mp_ptr vp,
For the lehmer call after the loop, Let T denote
GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
- for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
- sufficient for both operations.
+ + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient.
*/
@@ -204,7 +204,6 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
mp_size_t matrix_scratch;
mp_size_t ualloc = n + 1;
- struct gcdext_ctx ctx;
mp_size_t un;
mp_ptr u0;
mp_ptr u1;
@@ -215,7 +214,6 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
ASSERT (an >= n);
ASSERT (n > 0);
- ASSERT (bp[n-1] > 0);
TMP_MARK;
@@ -284,10 +282,6 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
u0 = tp; tp += ualloc;
u1 = tp; tp += ualloc;
- ctx.gp = gp;
- ctx.up = up;
- ctx.usize = usizep;
-
{
/* For the first hgcd call, there are no u updates, and it makes
some sense to use a different choice for p. */
@@ -321,22 +315,21 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
/* mpn_hgcd has failed. Then either one of a or b is very
small, or the difference is very small. Perform one
subtraction followed by one division. */
- u1[0] = 1;
+ mp_size_t gn;
+ mp_size_t updated_un = 1;
- ctx.u0 = u0;
- ctx.u1 = u1;
- ctx.tp = tp + n; /* ualloc */
- ctx.un = 1;
+ u1[0] = 1;
- /* Temporary storage n */
- n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
+ /* Temporary storage 2n + 1 */
+ n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
+ u0, u1, &updated_un, tp, tp + n);
if (n == 0)
{
TMP_FREE;
- return ctx.gn;
+ return gn;
}
- un = ctx.un;
+ un = updated_un;
ASSERT (un < ualloc);
}
}
@@ -378,45 +371,22 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
/* mpn_hgcd has failed. Then either one of a or b is very
small, or the difference is very small. Perform one
subtraction followed by one division. */
- ctx.u0 = u0;
- ctx.u1 = u1;
- ctx.tp = tp + n; /* ualloc */
- ctx.un = un;
+ mp_size_t gn;
+ mp_size_t updated_un = un;
- /* Temporary storage n */
- n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
+ /* Temporary storage 2n + 1 */
+ n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
+ u0, u1, &updated_un, tp, tp + n);
if (n == 0)
{
TMP_FREE;
- return ctx.gn;
+ return gn;
}
- un = ctx.un;
+ un = updated_un;
ASSERT (un < ualloc);
}
}
- /* We have A = ... a + ... b
- B = u0 a + u1 b
-
- a = u1 A + ... B
- b = -u0 A + ... B
-
- with bounds
-
- |u0|, |u1| <= B / min(a, b)
-
- We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
- in which case the only reduction done so far is a = A - k B for
- some k.
-
- Compute g = u a + v b = (u u1 - v u0) A + (...) B
- Here, u, v are bounded by
-
- |u| <= b,
- |v| <= a
- */
-
- ASSERT ( (ap[n-1] | bp[n-1]) > 0);
if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
{
@@ -426,10 +396,7 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
MPN_COPY (gp, ap, n);
MPN_CMP (c, u0, u1, un);
- /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
- this case we choose the cofactor + 1, corresponding to G = A
- - k B, rather than -1, corresponding to G = - A + (k+1) B. */
- ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
+ ASSERT (c != 0);
if (c < 0)
{
MPN_NORMALIZE (u0, un);
@@ -446,9 +413,10 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
TMP_FREE;
return n;
}
- else if (UNLIKELY (u0[0] == 0) && un == 1)
+ else if (mpn_zero_p (u0, un))
{
mp_size_t gn;
+ ASSERT (un == 1);
ASSERT (u1[0] == 1);
/* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
@@ -459,6 +427,23 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
}
else
{
+ /* We have A = ... a + ... b
+ B = u0 a + u1 b
+
+ a = u1 A + ... B
+ b = -u0 A + ... B
+
+ with bounds
+
+ |u0|, |u1| <= B / min(a, b)
+
+ Compute g = u a + v b = (u u1 - v u0) A + (...) B
+ Here, u, v are bounded by
+
+ |u| <= b,
+ |v| <= a
+ */
+
mp_size_t u0n;
mp_size_t u1n;
mp_size_t lehmer_un;
@@ -478,8 +463,6 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
u0n = un;
MPN_NORMALIZE (u0, u0n);
- ASSERT (u0n > 0);
-
if (lehmer_un == 0)
{
/* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
@@ -505,12 +488,25 @@ mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
u1n = un;
MPN_NORMALIZE (u1, u1n);
- ASSERT (u1n > 0);
+
+ /* It's possible that u0 = 1, u1 = 0 */
+ if (u1n == 0)
+ {
+ ASSERT (un == 1);
+ ASSERT (u0[0] == 1);
+
+ /* u1 == 0 ==> u u1 + v u0 = v */
+ MPN_COPY (up, lehmer_vp, lehmer_vn);
+ *usizep = negate ? lehmer_vn : - lehmer_vn;
+
+ TMP_FREE;
+ return gn;
+ }
ASSERT (lehmer_un + u1n <= ualloc);
ASSERT (lehmer_vn + u0n <= ualloc);
- /* We may still have v == 0 */
+ /* Now u0, u1, u are non-zero. We may still have v == 0 */
/* Compute u u0 */
if (lehmer_un <= u1n)
diff --git a/gmp/mpn/generic/gcdext_1.c b/gmp/mpn/generic/gcdext_1.c
index ea46cceb72..f1dd9ee963 100644
--- a/gmp/mpn/generic/gcdext_1.c
+++ b/gmp/mpn/generic/gcdext_1.c
@@ -1,273 +1,27 @@
/* mpn_gcdext -- Extended Greatest Common Divisor.
-Copyright 1996, 1998, 2000-2005, 2008, 2009 Free Software Foundation, Inc.
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-#ifndef GCDEXT_1_USE_BINARY
-#define GCDEXT_1_USE_BINARY 0
-#endif
-
-#ifndef GCDEXT_1_BINARY_METHOD
-#define GCDEXT_1_BINARY_METHOD 2
-#endif
-
-#ifndef USE_ZEROTAB
-#define USE_ZEROTAB 1
-#endif
-
-#if GCDEXT_1_USE_BINARY
-
-#if USE_ZEROTAB
-static unsigned char zerotab[0x40] = {
- 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
- 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
- 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
- 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
-};
-#endif
-
-mp_limb_t
-mpn_gcdext_1 (mp_limb_signed_t *sp, mp_limb_signed_t *tp,
- mp_limb_t u, mp_limb_t v)
-{
- /* Maintain
-
- U = t1 u + t0 v
- V = s1 u + s0 v
-
- where U, V are the inputs (without any shared power of two),
- and the matrix has determinant ± 2^{shift}.
- */
- mp_limb_t s0 = 1;
- mp_limb_t t0 = 0;
- mp_limb_t s1 = 0;
- mp_limb_t t1 = 1;
- mp_limb_t ug;
- mp_limb_t vg;
- mp_limb_t ugh;
- mp_limb_t vgh;
- unsigned zero_bits;
- unsigned shift;
- unsigned i;
-#if GCDEXT_1_BINARY_METHOD == 2
- mp_limb_t det_sign;
-#endif
-
- ASSERT (u > 0);
- ASSERT (v > 0);
-
- count_trailing_zeros (zero_bits, u | v);
- u >>= zero_bits;
- v >>= zero_bits;
-
- if ((u & 1) == 0)
- {
- count_trailing_zeros (shift, u);
- u >>= shift;
- t1 <<= shift;
- }
- else if ((v & 1) == 0)
- {
- count_trailing_zeros (shift, v);
- v >>= shift;
- s0 <<= shift;
- }
- else
- shift = 0;
-
-#if GCDEXT_1_BINARY_METHOD == 1
- while (u != v)
- {
- unsigned count;
- if (u > v)
- {
- u -= v;
-#if USE_ZEROTAB
- count = zerotab [u & 0x3f];
- u >>= count;
- if (UNLIKELY (count == 6))
- {
- unsigned c;
- do
- {
- c = zerotab[u & 0x3f];
- u >>= c;
- count += c;
- }
- while (c == 6);
- }
-#else
- count_trailing_zeros (count, u);
- u >>= count;
-#endif
- t0 += t1; t1 <<= count;
- s0 += s1; s1 <<= count;
- }
- else
- {
- v -= u;
-#if USE_ZEROTAB
- count = zerotab [v & 0x3f];
- v >>= count;
- if (UNLIKELY (count == 6))
- {
- unsigned c;
- do
- {
- c = zerotab[v & 0x3f];
- v >>= c;
- count += c;
- }
- while (c == 6);
- }
-#else
- count_trailing_zeros (count, v);
- v >>= count;
-#endif
- t1 += t0; t0 <<= count;
- s1 += s0; s0 <<= count;
- }
- shift += count;
- }
-#else
-# if GCDEXT_1_BINARY_METHOD == 2
- u >>= 1;
- v >>= 1;
-
- det_sign = 0;
-
- while (u != v)
- {
- unsigned count;
- mp_limb_t d = u - v;
- mp_limb_t vgtu = LIMB_HIGHBIT_TO_MASK (d);
- mp_limb_t sx;
- mp_limb_t tx;
-
- /* When v <= u (vgtu == 0), the updates are:
-
- (u; v) <-- ( (u - v) >> count; v) (det = +(1<<count) for corr. M factor)
- (t1, t0) <-- (t1 << count, t0 + t1)
-
- and when v > 0, the updates are
-
- (u; v) <-- ( (v - u) >> count; u) (det = -(1<<count))
- (t1, t0) <-- (t0 << count, t0 + t1)
-
- and similarly for s1, s0
- */
-
- /* v <-- min (u, v) */
- v += (vgtu & d);
-
- /* u <-- |u - v| */
- u = (d ^ vgtu) - vgtu;
-
- /* Number of trailing zeros is the same no matter if we look at
- * d or u, but using d gives more parallelism. */
-#if USE_ZEROTAB
- count = zerotab[d & 0x3f];
- if (UNLIKELY (count == 6))
- {
- unsigned c = 6;
- do
- {
- d >>= c;
- c = zerotab[d & 0x3f];
- count += c;
- }
- while (c == 6);
- }
-#else
- count_trailing_zeros (count, d);
-#endif
- det_sign ^= vgtu;
-
- tx = vgtu & (t0 - t1);
- sx = vgtu & (s0 - s1);
- t0 += t1;
- s0 += s1;
- t1 += tx;
- s1 += sx;
-
- count++;
- u >>= count;
- t1 <<= count;
- s1 <<= count;
- shift += count;
- }
- u = (u << 1) + 1;
-# else /* GCDEXT_1_BINARY_METHOD == 2 */
-# error Unknown GCDEXT_1_BINARY_METHOD
-# endif
-#endif
-
- /* Now u = v = g = gcd (u,v). Compute U/g and V/g */
- ug = t0 + t1;
- vg = s0 + s1;
-
- ugh = ug/2 + (ug & 1);
- vgh = vg/2 + (vg & 1);
-
- /* Now ±2^{shift} g = s0 U - t0 V. Get rid of the power of two, using
- s0 U - t0 V = (s0 + V/g) U - (t0 + U/g) V. */
- for (i = 0; i < shift; i++)
- {
- mp_limb_t mask = - ( (s0 | t0) & 1);
-
- s0 /= 2;
- t0 /= 2;
- s0 += mask & vgh;
- t0 += mask & ugh;
- }
- /* FIXME: Try simplifying this condition. */
- if ( (s0 > 1 && 2*s0 >= vg) || (t0 > 1 && 2*t0 >= ug) )
- {
- s0 -= vg;
- t0 -= ug;
- }
-#if GCDEXT_1_BINARY_METHOD == 2
- /* Conditional negation. */
- s0 = (s0 ^ det_sign) - det_sign;
- t0 = (t0 ^ det_sign) - det_sign;
-#endif
- *sp = s0;
- *tp = -t0;
-
- return u << zero_bits;
-}
-
-#else /* !GCDEXT_1_USE_BINARY */
-
/* FIXME: Takes two single-word limbs. It could be extended to a
* function that accepts a bignum for the first input, and only
@@ -325,4 +79,3 @@ mpn_gcdext_1 (mp_limb_signed_t *up, mp_limb_signed_t *vp,
v1 -= q * v0;
}
}
-#endif /* !GCDEXT_1_USE_BINARY */
diff --git a/gmp/mpn/generic/gcdext_lehmer.c b/gmp/mpn/generic/gcdext_lehmer.c
index 547f69a409..8599a4f554 100644
--- a/gmp/mpn/generic/gcdext_lehmer.c
+++ b/gmp/mpn/generic/gcdext_lehmer.c
@@ -1,146 +1,31 @@
/* mpn_gcdext -- Extended Greatest Common Divisor.
-Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
-Inc.
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-/* Here, d is the index of the cofactor to update. FIXME: Could use qn
- = 0 for the common case q = 1. */
-void
-mpn_gcdext_hook (void *p, mp_srcptr gp, mp_size_t gn,
- mp_srcptr qp, mp_size_t qn, int d)
-{
- struct gcdext_ctx *ctx = (struct gcdext_ctx *) p;
- mp_size_t un = ctx->un;
-
- if (gp)
- {
- mp_srcptr up;
-
- ASSERT (gn > 0);
- ASSERT (gp[gn-1] > 0);
-
- MPN_COPY (ctx->gp, gp, gn);
- ctx->gn = gn;
-
- if (d < 0)
- {
- int c;
-
- /* Must return the smallest cofactor, +u1 or -u0 */
- MPN_CMP (c, ctx->u0, ctx->u1, un);
- ASSERT (c != 0 || (un == 1 && ctx->u0[0] == 1 && ctx->u1[0] == 1));
-
- d = c < 0;
- }
-
- up = d ? ctx->u0 : ctx->u1;
-
- MPN_NORMALIZE (up, un);
- MPN_COPY (ctx->up, up, un);
-
- *ctx->usize = d ? -un : un;
- }
- else
- {
- mp_limb_t cy;
- mp_ptr u0 = ctx->u0;
- mp_ptr u1 = ctx->u1;
-
- ASSERT (d >= 0);
-
- if (d)
- MP_PTR_SWAP (u0, u1);
-
- qn -= (qp[qn-1] == 0);
-
- /* Update u0 += q * u1 */
- if (qn == 1)
- {
- mp_limb_t q = qp[0];
-
- if (q == 1)
- /* A common case. */
- cy = mpn_add_n (u0, u0, u1, un);
- else
- cy = mpn_addmul_1 (u0, u1, un, q);
- }
- else
- {
- mp_size_t u1n;
- mp_ptr tp;
-
- u1n = un;
- MPN_NORMALIZE (u1, u1n);
-
- if (u1n == 0)
- return;
-
- /* Should always have u1n == un here, and u1 >= u0. The
- reason is that we alternate adding u0 to u1 and u1 to u0
- (corresponding to subtractions a - b and b - a), and we
- can get a large quotient only just after a switch, which
- means that we'll add (a multiple of) the larger u to the
- smaller. */
-
- tp = ctx->tp;
-
- if (qn > u1n)
- mpn_mul (tp, qp, qn, u1, u1n);
- else
- mpn_mul (tp, u1, u1n, qp, qn);
-
- u1n += qn;
- u1n -= tp[u1n-1] == 0;
-
- if (u1n >= un)
- {
- cy = mpn_add (u0, tp, u1n, u0, un);
- un = u1n;
- }
- else
- /* Note: Unlikely case, maybe never happens? */
- cy = mpn_add (u0, u0, un, tp, u1n);
-
- }
- u0[un] = cy;
- ctx->un = un + (cy > 0);
- }
-}
-
-/* Temporary storage: 3*(n+1) for u. If hgcd2 succeeds, we need n for
- the matrix-vector multiplication adjusting a, b. If hgcd fails, we
- need at most n for the quotient and n+1 for the u update (reusing
- the extra u). In all, 4n + 3. */
+/* Temporary storage: 3*(n+1) for u. n+1 for the matrix-vector
+ multiplications (if hgcd2 succeeds). If hgcd2 fails, n+1 limbs are
+ needed for the division, with at most n for the quotient, and n+1 for
+ the product q u0. In all, 4n + 3. */
mp_size_t
mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize,
@@ -156,16 +41,8 @@ mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize,
* which correspond to the first column of the inverse
*
* M^{-1} = (u1, -v1; -u0, v0)
- *
- * This implies that
- *
- * a = u1 A (mod B)
- * b = -u0 A (mod B)
- *
- * where A, B denotes the input values.
*/
- struct gcdext_ctx ctx;
mp_size_t un;
mp_ptr u0;
mp_ptr u1;
@@ -178,10 +55,6 @@ mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize,
u1[0] = 1; un = 1;
- ctx.gp = gp;
- ctx.up = up;
- ctx.usize = usize;
-
/* FIXME: Handle n == 2 differently, after the loop? */
while (n >= 2)
{
@@ -223,7 +96,7 @@ mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize,
/* Try an mpn_nhgcd2 step */
if (mpn_hgcd2 (ah, al, bh, bl, &M))
{
- n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n);
+ n = mpn_hgcd_mul_matrix1_inverse_vector (&M, tp, ap, bp, n);
MP_PTR_SWAP (ap, tp);
un = mpn_hgcd_mul_matrix1_vector(&M, u2, u0, u1, un);
MP_PTR_SWAP (u0, u2);
@@ -233,18 +106,17 @@ mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize,
/* mpn_hgcd2 has failed. Then either one of a or b is very
small, or the difference is very small. Perform one
subtraction followed by one division. */
- ctx.u0 = u0;
- ctx.u1 = u1;
- ctx.tp = u2;
- ctx.un = un;
+ mp_size_t gn;
+ mp_size_t updated_un = un;
/* Temporary storage n for the quotient and ualloc for the
new cofactor. */
- n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
+ n = mpn_gcdext_subdiv_step (gp, &gn, up, usize, ap, bp, n,
+ u0, u1, &updated_un, tp, u2);
if (n == 0)
- return ctx.gn;
+ return gn;
- un = ctx.un;
+ un = updated_un;
}
}
ASSERT_ALWAYS (ap[0] > 0);
diff --git a/gmp/mpn/generic/gcdext_subdiv_step.c b/gmp/mpn/generic/gcdext_subdiv_step.c
new file mode 100644
index 0000000000..d54b3bdee1
--- /dev/null
+++ b/gmp/mpn/generic/gcdext_subdiv_step.c
@@ -0,0 +1,197 @@
+/* gcdext_subdiv_step.c.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003, 2004, 2005, 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or
+ b is small, or the difference is small. Perform one subtraction
+ followed by one division. If the gcd is found, stores it in gp and
+ *gn, and returns zero. Otherwise, compute the reduced a and b,
+ return the new size, and cofactors.
+
+ Arguments:
+
+ gp, gn      Written only when zero is returned: the gcd limbs and
+             their count.
+ up, usizep  Written only when zero is returned: the normalized
+             cofactor (copied from u0 or u1) and its signed size.
+ ap, bp      The two operands, n limbs each; at least one must have a
+             non-zero high limb. Both areas serve as working storage:
+             the difference, and later the remainder, overwrite
+             whichever operand is the larger at that point.
+ n           Common limb count of ap and bp, n > 0.
+ u0, u1      Current cofactors, *unp limbs each. The update stores
+             limbs beyond index *unp - 1, and because of the swaps it
+             may do so in either area, so both need room for the
+             grown cofactor.
+ unp         In: cofactor size. Out: updated cofactor size, stored
+             only on the non-zero return path.
+ qp, tp      Scratch for the quotient and for the product q u0, see
+             the storage note that follows.
+
+ Returns zero if the gcd was found, otherwise the new limb count,
+ which is the size of the divisor used in the division step. */
+
+/* Temporary storage: Needs n limbs for the quotient, at qp. tp must
+ point to an area large enough for the resulting cofactor, plus one
+ limb extra. All in all, 2N + 1 if N is a bound for both inputs and
+ outputs. */
+mp_size_t
+mpn_gcdext_subdiv_step (mp_ptr gp, mp_size_t *gn, mp_ptr up, mp_size_t *usizep,
+ mp_ptr ap, mp_ptr bp, mp_size_t n,
+ mp_ptr u0, mp_ptr u1, mp_size_t *unp,
+ mp_ptr qp, mp_ptr tp)
+{
+ mp_size_t an, bn, un;
+ mp_size_t qn;
+ mp_size_t u0n;
+
+ int swapped; /* Toggled each time the local (a, u0) and (b, u1) roles are
+ exchanged; read only to choose the sign stored in *usizep. */
+
+ an = bn = n;
+
+ ASSERT (an > 0);
+ ASSERT (ap[an-1] > 0 || bp[an-1] > 0);
+
+ MPN_NORMALIZE (ap, an);
+ MPN_NORMALIZE (bp, bn);
+
+ un = *unp;
+
+ swapped = 0;
+
+ if (UNLIKELY (an == 0))
+ {
+ return_b: /* Also entered by goto further down, once b is known to be
+ the gcd (a == b after the subtraction, or zero remainder). */
+ MPN_COPY (gp, bp, bn);
+ *gn = bn;
+
+ MPN_NORMALIZE (u0, un);
+ MPN_COPY (up, u0, un);
+
+ *usizep = swapped ? un : -un; /* cofactor is u0, negative unless the
+ roles are currently swapped */
+
+ return 0;
+ }
+ else if (UNLIKELY (bn == 0))
+ {
+ MPN_COPY (gp, ap, an);
+ *gn = an;
+
+ MPN_NORMALIZE (u1, un);
+ MPN_COPY (up, u1, un);
+
+ *usizep = swapped ? -un : un; /* cofactor is u1, positive unless the
+ roles are currently swapped */
+
+ return 0;
+ }
+
+ /* Arrange so that a > b, subtract a -= b, and maintain
+ normalization. */
+ if (an < bn)
+ {
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (UNLIKELY (c == 0))
+ {
+ MPN_COPY (gp, ap, an);
+ *gn = an;
+
+ /* Must return the smallest cofactor, +u1 or -u0 */
+ MPN_CMP (c, u0, u1, un);
+ ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
+
+ if (c < 0)
+ {
+ MPN_NORMALIZE (u0, un);
+ MPN_COPY (up, u0, un);
+ swapped ^= 1; /* No pointers move here; the toggle only flips the
+ sign selected for *usizep just below. */
+ }
+ else
+ {
+ MPN_NORMALIZE_NOT_ZERO (u1, un);
+ MPN_COPY (up, u1, un);
+ }
+
+ *usizep = swapped ? -un : un;
+ return 0;
+ }
+ else if (c < 0)
+ {
+ MP_PTR_SWAP (ap, bp);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ }
+ /* Reduce a -= b, u1 += u0 */
+ ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn));
+ MPN_NORMALIZE (ap, an);
+ ASSERT (an > 0);
+
+ u1[un] = mpn_add_n (u1, u1, u0, un); /* carry limb lands at u1[un] */
+ un += (u1[un] > 0);
+
+ /* Arrange so that a > b, and divide a = q b + r */
+ if (an < bn)
+ {
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (UNLIKELY (c == 0))
+ goto return_b;
+ else if (c < 0)
+ {
+ MP_PTR_SWAP (ap, bp);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ }
+
+ /* Reduce a -= q b, u1 += q u0 */
+ qn = an - bn + 1; /* limb count of the quotient area written at qp */
+ mpn_tdiv_qr (qp, ap, 0, ap, an, bp, bn);
+
+ if (mpn_zero_p (ap, bn)) /* zero remainder, so b is the gcd */
+ goto return_b;
+
+ n = bn;
+
+ /* Update u1 += q u0 */
+ u0n = un;
+ MPN_NORMALIZE (u0, u0n);
+
+ if (u0n > 0) /* when u0 is zero the product is zero and u1 is unchanged */
+ {
+ qn -= (qp[qn - 1] == 0); /* drop a zero high quotient limb */
+
+ if (qn > u0n) /* mpn_mul is given the longer operand first */
+ mpn_mul (tp, qp, qn, u0, u0n);
+ else
+ mpn_mul (tp, u0, u0n, qp, qn);
+
+ if (qn + u0n > un)
+ {
+ ASSERT_NOCARRY (mpn_add (u1, tp, qn + u0n, u1, un));
+ un = qn + u0n;
+ un -= (u1[un-1] == 0); /* the high limb of the sum may be zero */
+ }
+ else
+ {
+ u1[un] = mpn_add (u1, u1, un, tp, qn + u0n); /* carry limb at u1[un] */
+ un += (u1[un] > 0);
+ }
+ }
+
+ *unp = un;
+ return n;
+}
diff --git a/gmp/mpn/generic/get_d.c b/gmp/mpn/generic/get_d.c
index d73d314856..cf4ae86efc 100644
--- a/gmp/mpn/generic/get_d.c
+++ b/gmp/mpn/generic/get_d.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2003, 2004, 2007, 2009, 2010, 2012 Free Software Foundation, Inc.
+Copyright 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -40,20 +29,33 @@ see https://www.gnu.org/licenses/. */
#define _GMP_IEEE_FLOATS 0
#endif
+#if ! _GMP_IEEE_FLOATS
+/* dummy definition, just to let dead code compile */
+union ieee_double_extract {
+ struct {
+ int manh, manl, sig, exp;
+ } s;
+ double d;
+};
+#endif
+
/* To force use of the generic C code for testing, put
"#define _GMP_IEEE_FLOATS 0" at this point. */
+
/* In alpha gcc prior to 3.4, signed DI comparisons involving constants are
rearranged from "x < n" to "x+(-n) < 0", which is of course hopelessly
wrong if that addition overflows.
- The workaround here avoids this bug by ensuring n is not a literal constant.
- Note that this is alpha specific. The offending transformation is/was in
- alpha.c alpha_emit_conditional_branch() under "We want to use cmpcc/bcc".
+ The workaround here avoids this bug by ensuring n is not a literal
+ constant. Note that this is alpha specific. The offending transformation
+ is/was in alpha.c alpha_emit_conditional_branch() under "We want to use
+ cmpcc/bcc".
- Bizarrely, this happens also with Cray cc on alphaev5-cray-unicosmk2.0.6.X,
- and has the same solution. Don't know why or how. */
+ Bizarrely, it turns out this happens also with Cray cc on
+ alphaev5-cray-unicosmk2.0.6.X, and has the same solution. Don't know why
+ or how. */
#if HAVE_HOST_CPU_FAMILY_alpha \
&& ((defined (__GNUC__) && ! __GMP_GNUC_PREREQ(3,4)) \
@@ -68,73 +70,69 @@ static volatile const long CONST_NEG_1022_SUB_53 = -1022 - 53;
#endif
-/* Return the value {ptr,size}*2^exp, and negative if sign<0. Must have
- size>=1, and a non-zero high limb ptr[size-1].
- When we know the fp format, the result is truncated towards zero. This is
- consistent with other gmp conversions, like mpz_set_f or mpz_set_q, and is
- easy to implement and test.
+/* Return the value {ptr,size}*2^exp, and negative if sign<0.
+ Must have size>=1, and a non-zero high limb ptr[size-1].
- When we do not know the format, such truncation seems much harder. One
- would need to defeat any rounding mode, including round-up.
+ {ptr,size} is truncated towards zero. This is consistent with other gmp
+ conversions, like mpz_set_f or mpz_set_q, and is easy to implement and
+ test.
+
+ In the past conversions had attempted (imperfectly) to let the hardware
+ float rounding mode take effect, but that gets tricky since multiple
+ roundings need to be avoided, or taken into account, and denorms mean the
+ effective precision of the mantissa is not constant. (For reference,
+ mpz_get_d on IEEE systems was ok, except it operated on the absolute
+ value. mpf_get_d and mpq_get_d suffered from multiple roundings and from
+ not always using enough bits to get the rounding right.)
It's felt that GMP is not primarily concerned with hardware floats, and
really isn't enhanced by getting involved with hardware rounding modes
- (which could even be some weird unknown style), so something unambiguous and
- straightforward is best.
+ (which could even be some weird unknown style), so something unambiguous
+ and straightforward is best.
The IEEE code below is the usual case, it knows either a 32-bit or 64-bit
limb and is done with shifts and masks. The 64-bit case in particular
should come out nice and compact.
- The generic code used to work one bit at a time, which was not only slow,
- but implicitly relied upon denorms for intermediates, since the lowest bits'
- weight of a perfectly valid fp number underflows in non-denorm. Therefore,
- the generic code now works limb-per-limb, initially creating a number x such
- that 1 <= x <= BASE. (BASE is reached only as result of rounding.) Then
- x's exponent is scaled with explicit code (not ldexp to avoid libm
- dependency). It is a tap-dance to avoid underflow or overflow, beware!
+ The generic code works one bit at a time, which will be quite slow, but
+ should support any binary-based "double" and be safe against any rounding
+ mode. Note in particular it works on IEEE systems too.
Traps:
- Hardware traps for overflow to infinity, underflow to zero, or unsupported
- denorms may or may not be taken. The IEEE code works bitwise and so
- probably won't trigger them, the generic code works by float operations and
- so probably will. This difference might be thought less than ideal, but
- again its felt straightforward code is better than trying to get intimate
- with hardware exceptions (of perhaps unknown nature).
+ Hardware traps for overflow to infinity, underflow to zero, or
+ unsupported denorms may or may not be taken. The IEEE code works bitwise
+ and so probably won't trigger them, the generic code works by float
+ operations and so probably will. This difference might be thought less
+ than ideal, but again it's felt straightforward code is better than trying
+ to get intimate with hardware exceptions (of perhaps unknown nature).
Not done:
- mpz_get_d in the past handled size==1 with a cast limb->double. This might
- still be worthwhile there (for up to the mantissa many bits), but for
- mpn_get_d here, the cost of applying "exp" to the resulting exponent would
- probably use up any benefit a cast may have over bit twiddling. Also, if
- the exponent is pushed into denorm range then bit twiddling is the only
- option, to ensure the desired truncation is obtained.
+ mpz_get_d in the past handled size==1 with a cast limb->double. This
+ might still be worthwhile there (for up to the mantissa many bits), but
+ for mpn_get_d here, the cost of applying "exp" to the resulting exponent
+ would probably use up any benefit a cast may have over bit twiddling.
+ Also, if the exponent is pushed into denorm range then bit twiddling is
+ the only option, to ensure the desired truncation is obtained.
Other:
For reference, note that HPPA 8000, 8200, 8500 and 8600 trap FCNV,UDW,DBL
- to the kernel for values >= 2^63. This makes it slow, and worse the kernel
- Linux (what versions?) apparently uses untested code in its trap handling
- routines, and gets the sign wrong. We don't use such a limb-to-double
- cast, neither in the IEEE or generic code. */
-
+ to the kernel for values >= 2^63. This makes it slow, and worse the
+ Linux kernel (what versions?) apparently uses untested code in its trap
+ handling routines, and gets the sign wrong. We don't use such a limb to
+ double cast, in either the IEEE or the generic code. */
-#undef FORMAT_RECOGNIZED
-
double
mpn_get_d (mp_srcptr up, mp_size_t size, mp_size_t sign, long exp)
{
- int lshift, nbits;
- mp_limb_t x, mhi, mlo;
-
ASSERT (size >= 0);
ASSERT_MPN (up, size);
ASSERT (size == 0 || up[size-1] != 0);
@@ -146,11 +144,10 @@ mpn_get_d (mp_srcptr up, mp_size_t size, mp_size_t sign, long exp)
overflow. After this exp can of course be reduced to anywhere within
the {up,size} region without underflow. */
if (UNLIKELY ((unsigned long) (GMP_NUMB_BITS * size)
- > ((unsigned long) LONG_MAX - exp)))
+ > (unsigned long) (LONG_MAX - exp)))
{
-#if _GMP_IEEE_FLOATS
- goto ieee_infinity;
-#endif
+ if (_GMP_IEEE_FLOATS)
+ goto ieee_infinity;
/* generic */
exp = LONG_MAX;
@@ -160,253 +157,334 @@ mpn_get_d (mp_srcptr up, mp_size_t size, mp_size_t sign, long exp)
exp += GMP_NUMB_BITS * size;
}
-#if _GMP_IEEE_FLOATS
- {
- union ieee_double_extract u;
-
- up += size;
+#if 1
+{
+ int lshift, nbits;
+ union ieee_double_extract u;
+ mp_limb_t x, mhi, mlo;
#if GMP_LIMB_BITS == 64
- mlo = up[-1];
- count_leading_zeros (lshift, mlo);
+ mp_limb_t m;
+ up += size;
+ m = *--up;
+ count_leading_zeros (lshift, m);
- exp -= (lshift - GMP_NAIL_BITS) + 1;
- mlo <<= lshift;
+ exp -= (lshift - GMP_NAIL_BITS) + 1;
+ m <<= lshift;
- nbits = GMP_LIMB_BITS - lshift;
+ nbits = GMP_LIMB_BITS - lshift;
- if (nbits < 53 && size > 1)
+ if (nbits < 53 && size > 1)
+ {
+ x = *--up;
+ x <<= GMP_NAIL_BITS;
+ x >>= nbits;
+ m |= x;
+ nbits += GMP_NUMB_BITS;
+
+ if (LIMBS_PER_DOUBLE >= 3 && nbits < 53 && size > 2)
{
- x = up[-2];
+ x = *--up;
x <<= GMP_NAIL_BITS;
x >>= nbits;
- mlo |= x;
+ m |= x;
nbits += GMP_NUMB_BITS;
-
- if (LIMBS_PER_DOUBLE >= 3 && nbits < 53 && size > 2)
- {
- x = up[-3];
- x <<= GMP_NAIL_BITS;
- x >>= nbits;
- mlo |= x;
- nbits += GMP_NUMB_BITS;
- }
}
- mhi = mlo >> (32 + 11);
- mlo = mlo >> 11; /* later implicitly truncated to 32 bits */
+ }
+ mhi = m >> (32 + 11);
+ mlo = m >> 11;
#endif
#if GMP_LIMB_BITS == 32
- x = *--up;
- count_leading_zeros (lshift, x);
+ up += size;
+ x = *--up, size--;
+ count_leading_zeros (lshift, x);
- exp -= (lshift - GMP_NAIL_BITS) + 1;
- x <<= lshift;
- mhi = x >> 11;
+ exp -= (lshift - GMP_NAIL_BITS) + 1;
+ x <<= lshift;
+ mhi = x >> 11;
- if (lshift < 11) /* FIXME: never true if NUMB < 20 bits */
+ if (lshift < 11) /* FIXME: never true if NUMB < 20 bits */
+ {
+ /* All 20 bits in mhi */
+ mlo = x << 21;
+ /* >= 1 bit in mlo */
+ nbits = GMP_LIMB_BITS - lshift - 21;
+ }
+ else
+ {
+ if (size != 0)
{
- /* All 20 bits in mhi */
- mlo = x << 21;
- /* >= 1 bit in mlo */
- nbits = GMP_LIMB_BITS - lshift - 21;
+ nbits = GMP_LIMB_BITS - lshift;
+
+ x = *--up, size--;
+ x <<= GMP_NAIL_BITS;
+ mhi |= x >> nbits >> 11;
+
+ mlo = x << GMP_LIMB_BITS - nbits - 11;
+ nbits = nbits + 11 - GMP_NAIL_BITS;
}
else
{
- if (size > 1)
- {
- nbits = GMP_LIMB_BITS - lshift;
-
- x = *--up, size--;
- x <<= GMP_NAIL_BITS;
- mhi |= x >> nbits >> 11;
-
- mlo = x << GMP_LIMB_BITS - nbits - 11;
- nbits = nbits + 11 - GMP_NAIL_BITS;
- }
- else
- {
- mlo = 0;
- goto done;
- }
+ mlo = 0;
+ goto done;
}
+ }
- /* Now all needed bits in mhi have been accumulated. Add bits to mlo. */
+ if (LIMBS_PER_DOUBLE >= 2 && nbits < 32 && size != 0)
+ {
+ x = *--up, size--;
+ x <<= GMP_NAIL_BITS;
+ x >>= nbits;
+ mlo |= x;
+ nbits += GMP_NUMB_BITS;
- if (LIMBS_PER_DOUBLE >= 2 && nbits < 32 && size > 1)
+ if (LIMBS_PER_DOUBLE >= 3 && nbits < 32 && size != 0)
{
- x = up[-1];
+ x = *--up, size--;
x <<= GMP_NAIL_BITS;
x >>= nbits;
mlo |= x;
nbits += GMP_NUMB_BITS;
- if (LIMBS_PER_DOUBLE >= 3 && nbits < 32 && size > 2)
+ if (LIMBS_PER_DOUBLE >= 4 && nbits < 32 && size != 0)
{
- x = up[-2];
+ x = *--up;
x <<= GMP_NAIL_BITS;
x >>= nbits;
mlo |= x;
nbits += GMP_NUMB_BITS;
-
- if (LIMBS_PER_DOUBLE >= 4 && nbits < 32 && size > 3)
- {
- x = up[-3];
- x <<= GMP_NAIL_BITS;
- x >>= nbits;
- mlo |= x;
- nbits += GMP_NUMB_BITS;
- }
}
}
+ }
- done:;
+ done:;
#endif
+ {
+ if (UNLIKELY (exp >= CONST_1024))
+ {
+ /* overflow, return infinity */
+ ieee_infinity:
+ mhi = 0;
+ mlo = 0;
+ exp = 1024;
+ }
+ else if (UNLIKELY (exp <= CONST_NEG_1023))
+ {
+ int rshift;
+
+ if (LIKELY (exp <= CONST_NEG_1022_SUB_53))
+ return 0.0; /* denorm underflows to zero */
+
+ rshift = -1022 - exp;
+ ASSERT (rshift > 0 && rshift < 53);
+#if GMP_LIMB_BITS > 53
+ mlo >>= rshift;
+ mhi = mlo >> 32;
+#else
+ if (rshift >= 32)
+ {
+ mlo = mhi;
+ mhi = 0;
+ rshift -= 32;
+ }
+ lshift = GMP_LIMB_BITS - rshift;
+ mlo = (mlo >> rshift) | (rshift == 0 ? 0 : mhi << lshift);
+ mhi >>= rshift;
+#endif
+ exp = -1023;
+ }
+ }
+ u.s.manh = mhi;
+ u.s.manl = mlo;
+ u.s.exp = exp + 1023;
+ u.s.sig = (sign < 0);
+ return u.d;
+}
+#else
+
+
+#define ONE_LIMB (GMP_LIMB_BITS == 64 && 2*GMP_NUMB_BITS >= 53)
+#define TWO_LIMBS (GMP_LIMB_BITS == 32 && 3*GMP_NUMB_BITS >= 53)
+
+ if (_GMP_IEEE_FLOATS && (ONE_LIMB || TWO_LIMBS))
+ {
+ union ieee_double_extract u;
+ mp_limb_t m0, m1, m2, rmask;
+ int lshift, rshift;
+
+ m0 = up[size-1]; /* high limb */
+ m1 = (size >= 2 ? up[size-2] : 0); /* second highest limb */
+ count_leading_zeros (lshift, m0);
+
+ /* relative to just under high non-zero bit */
+ exp -= (lshift - GMP_NAIL_BITS) + 1;
+
+ if (ONE_LIMB)
+ {
+ /* lshift to have high of m0 non-zero, and collapse nails */
+ rshift = GMP_LIMB_BITS - lshift;
+ m1 <<= GMP_NAIL_BITS;
+ rmask = GMP_NAIL_BITS == 0 && lshift == 0 ? 0 : MP_LIMB_T_MAX;
+ m0 = (m0 << lshift) | ((m1 >> rshift) & rmask);
+
+ /* rshift back to have bit 53 of m0 the high non-zero */
+ m0 >>= 11;
+ }
+ else /* TWO_LIMBS */
+ {
+ m2 = (size >= 3 ? up[size-3] : 0); /* third highest limb */
+
+ /* collapse nails from m1 and m2 */
+#if GMP_NAIL_BITS != 0
+ m1 = (m1 << GMP_NAIL_BITS) | (m2 >> (GMP_NUMB_BITS-GMP_NAIL_BITS));
+ m2 <<= 2*GMP_NAIL_BITS;
+#endif
+
+ /* lshift to have high of m0:m1 non-zero, collapse nails from m0 */
+ rshift = GMP_LIMB_BITS - lshift;
+ rmask = (GMP_NAIL_BITS == 0 && lshift == 0 ? 0 : MP_LIMB_T_MAX);
+ m0 = (m0 << lshift) | ((m1 >> rshift) & rmask);
+ m1 = (m1 << lshift) | ((m2 >> rshift) & rmask);
+
+ /* rshift back to have bit 53 of m0:m1 the high non-zero */
+ m1 = (m1 >> 11) | (m0 << (GMP_LIMB_BITS-11));
+ m0 >>= 11;
+ }
+
if (UNLIKELY (exp >= CONST_1024))
{
/* overflow, return infinity */
ieee_infinity:
- mhi = 0;
- mlo = 0;
+ m0 = 0;
+ m1 = 0;
exp = 1024;
}
else if (UNLIKELY (exp <= CONST_NEG_1023))
{
- int rshift;
-
if (LIKELY (exp <= CONST_NEG_1022_SUB_53))
return 0.0; /* denorm underflows to zero */
rshift = -1022 - exp;
ASSERT (rshift > 0 && rshift < 53);
-#if GMP_LIMB_BITS > 53
- mlo >>= rshift;
- mhi = mlo >> 32;
-#else
- if (rshift >= 32)
+ if (ONE_LIMB)
{
- mlo = mhi;
- mhi = 0;
- rshift -= 32;
+ m0 >>= rshift;
}
- lshift = GMP_LIMB_BITS - rshift;
- mlo = (mlo >> rshift) | (rshift == 0 ? 0 : mhi << lshift);
- mhi >>= rshift;
-#endif
- exp = -1023;
- }
- u.s.manh = mhi;
- u.s.manl = mlo;
- u.s.exp = exp + 1023;
- u.s.sig = (sign < 0);
- return u.d;
- }
-#define FORMAT_RECOGNIZED 1
-#endif
-
-#if HAVE_DOUBLE_VAX_D
- {
- union double_extract u;
-
- up += size;
-
- mhi = up[-1];
-
- count_leading_zeros (lshift, mhi);
- exp -= lshift;
- mhi <<= lshift;
-
- mlo = 0;
- if (size > 1)
- {
- mlo = up[-2];
- if (lshift != 0)
- mhi += mlo >> (GMP_LIMB_BITS - lshift);
- mlo <<= lshift;
-
- if (size > 2 && lshift > 8)
+ else /* TWO_LIMBS */
{
- x = up[-3];
- mlo += x >> (GMP_LIMB_BITS - lshift);
+ if (rshift >= 32)
+ {
+ m1 = m0;
+ m0 = 0;
+ rshift -= 32;
+ }
+ lshift = GMP_LIMB_BITS - rshift;
+ m1 = (m1 >> rshift) | (rshift == 0 ? 0 : m0 << lshift);
+ m0 >>= rshift;
}
+ exp = -1023;
}
- if (UNLIKELY (exp >= 128))
+ if (ONE_LIMB)
{
- /* overflow, return maximum number */
- mhi = 0xffffffff;
- mlo = 0xffffffff;
- exp = 127;
+#if GMP_LIMB_BITS > 32 /* avoid compiler warning about big shift */
+ u.s.manh = m0 >> 32;
+#endif
+ u.s.manl = m0;
}
- else if (UNLIKELY (exp < -128))
+ else /* TWO_LIMBS */
{
- return 0.0; /* underflows to zero */
+ u.s.manh = m0;
+ u.s.manl = m1;
}
- u.s.man3 = mhi >> 24; /* drop msb, since implicit */
- u.s.man2 = mhi >> 8;
- u.s.man1 = (mhi << 8) + (mlo >> 24);
- u.s.man0 = mlo >> 8;
- u.s.exp = exp + 128;
- u.s.sig = sign < 0;
+ u.s.exp = exp + 1023;
+ u.s.sig = (sign < 0);
return u.d;
}
-#define FORMAT_RECOGNIZED 1
-#endif
-
-#if ! FORMAT_RECOGNIZED
- { /* Non-IEEE or strange limb size, do something generic. */
- mp_size_t i;
- double d, weight;
- unsigned long uexp;
-
- /* First generate an fp number disregarding exp, instead keeping things
- within the numb base factor from 1, which should prevent overflow and
- underflow even for the most exponent limited fp formats. The
- termination criteria should be refined, since we now include too many
- limbs. */
- weight = 1/MP_BASE_AS_DOUBLE;
- d = up[size - 1];
- for (i = size - 2; i >= 0; i--)
+ else
+ {
+ /* Non-IEEE or strange limb size, do something generic. */
+
+ mp_size_t i;
+ mp_limb_t limb, bit;
+ int shift;
+ double base, factor, prev_factor, d, new_d, diff;
+
+ /* "limb" is "up[i]" the limb being examined, "bit" is a mask for the
+ bit being examined, initially the highest non-zero bit. */
+ i = size-1;
+ limb = up[i];
+ count_leading_zeros (shift, limb);
+ bit = GMP_LIMB_HIGHBIT >> shift;
+
+ /* relative to just under high non-zero bit */
+ exp -= (shift - GMP_NAIL_BITS) + 1;
+
+ /* Power up "factor" to 2^exp, being the value of the "bit" in "limb"
+ being examined. */
+ base = (exp >= 0 ? 2.0 : 0.5);
+ exp = ABS (exp);
+ factor = 1.0;
+ for (;;)
{
- d += up[i] * weight;
- weight /= MP_BASE_AS_DOUBLE;
- if (weight == 0)
+ if (exp & 1)
+ {
+ prev_factor = factor;
+ factor *= base;
+ FORCE_DOUBLE (factor);
+ if (factor == 0.0)
+ return 0.0; /* underflow */
+ if (factor == prev_factor)
+ {
+ d = factor; /* overflow, apparent infinity */
+ goto generic_done;
+ }
+ }
+ exp >>= 1;
+ if (exp == 0)
break;
+ base *= base;
}
- /* Now apply exp. */
- exp -= GMP_NUMB_BITS;
- if (exp > 0)
- {
- weight = 2.0;
- uexp = exp;
- }
- else
- {
- weight = 0.5;
- uexp = 1 - (unsigned long) (exp + 1);
- }
-#if 1
- /* Square-and-multiply exponentiation. */
- if (uexp & 1)
- d *= weight;
- while (uexp >>= 1)
- {
- weight *= weight;
- if (uexp & 1)
- d *= weight;
- }
-#else
- /* Plain exponentiation. */
- while (uexp > 0)
+ /* Add a "factor" for each non-zero bit, working from high to low.
+ Stop if any rounding occurs, hence implementing a truncation.
+
+ Note no attention is paid to DBL_MANT_DIG, since the effective
+ number of bits in the mantissa isn't constant when in denorm range.
+ We also encountered an ARM system with apparently somewhat doubtful
+ software floats where DBL_MANT_DIG claimed 53 bits but only 32
+ actually worked. */
+
+ d = factor; /* high bit */
+ for (;;)
{
- d *= weight;
- uexp--;
+ factor *= 0.5; /* next bit */
+ bit >>= 1;
+ if (bit == 0)
+ {
+ /* next limb, if any */
+ i--;
+ if (i < 0)
+ break;
+ limb = up[i];
+ bit = GMP_NUMB_HIGHBIT;
+ }
+
+ if (bit & limb)
+ {
+ new_d = d + factor;
+ FORCE_DOUBLE (new_d);
+ diff = new_d - d;
+ if (diff != factor)
+ break; /* rounding occurred, stop now */
+ d = new_d;
+ }
}
-#endif
- return sign >= 0 ? d : -d;
+ generic_done:
+ return (sign >= 0 ? d : -d);
}
#endif
}
diff --git a/gmp/mpn/generic/get_str.c b/gmp/mpn/generic/get_str.c
index 42e93c9cee..df007578cc 100644
--- a/gmp/mpn/generic/get_str.c
+++ b/gmp/mpn/generic/get_str.c
@@ -7,34 +7,23 @@
FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE
GNU MP RELEASE.
-Copyright 1991-1994, 1996, 2000-2002, 2004, 2006-2008, 2011, 2012 Free Software
-Foundation, Inc.
+Copyright 1991, 1992, 1993, 1994, 1996, 2000, 2001, 2002, 2004, 2006, 2007,
+2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -361,8 +350,7 @@ mpn_dc_get_str (unsigned char *str, size_t len,
/* There are no leading zeros on the digits generated at str, but that's not
- currently a documented feature. The current mpz_out_str and mpz_get_str
- rely on it. */
+ currently a documented feature. */
size_t
mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
@@ -394,7 +382,7 @@ mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
int bit_pos;
mp_size_t i;
unsigned char *s = str;
- mp_bitcnt_t bits;
+ unsigned long bits;
n1 = up[un - 1];
count_leading_zeros (cnt, n1);
@@ -403,11 +391,11 @@ mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
R + bits_per_digit * n when input ends in nth least significant
nibble. */
- bits = (mp_bitcnt_t) GMP_NUMB_BITS * un - cnt + GMP_NAIL_BITS;
+ bits = GMP_NUMB_BITS * un - cnt + GMP_NAIL_BITS;
cnt = bits % bits_per_digit;
if (cnt != 0)
bits += bits_per_digit - cnt;
- bit_pos = bits - (mp_bitcnt_t) (un - 1) * GMP_NUMB_BITS;
+ bit_pos = bits - (un - 1) * GMP_NUMB_BITS;
/* Fast loop for bit output. */
i = un - 1;
@@ -451,12 +439,9 @@ mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
mp_size_t n_pows, xn, pn, exptab[GMP_LIMB_BITS], bexp;
mp_limb_t cy;
mp_size_t shift;
- size_t ndig;
-
- DIGITS_IN_BASE_PER_LIMB (ndig, un, base);
- xn = 1 + ndig / mp_bases[base].chars_per_limb; /* FIXME: scalar integer division */
n_pows = 0;
+ xn = 1 + un*(mp_bases[base].chars_per_bit_exactly*GMP_NUMB_BITS)/mp_bases[base].chars_per_limb;
for (pn = xn; pn != 1; pn = (pn + 1) >> 1)
{
exptab[n_pows] = pn;
@@ -488,7 +473,7 @@ mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
ASSERT_ALWAYS (powtab_mem_ptr < powtab_mem + mpn_dc_get_str_powtab_alloc (un));
- mpn_sqr (t, p, n);
+ mpn_sqr_n (t, p, n);
digits_in_base *= 2;
n *= 2; n -= t[n - 1] == 0;
@@ -546,7 +531,7 @@ mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
/* Using our precomputed powers, now in powtab[], convert our number. */
tmp = TMP_BALLOC_LIMBS (mpn_dc_get_str_itch (un));
- out_len = mpn_dc_get_str (str, 0, up, un, powtab + (pi - 1), tmp) - str;
+ out_len = mpn_dc_get_str (str, 0, up, un, powtab - 1 + pi, tmp) - str;
TMP_FREE;
return out_len;
diff --git a/gmp/mpn/generic/gmp-mparam.h b/gmp/mpn/generic/gmp-mparam.h
index 7dc057aa0c..b22b96ef67 100644
--- a/gmp/mpn/generic/gmp-mparam.h
+++ b/gmp/mpn/generic/gmp-mparam.h
@@ -5,29 +5,18 @@ Copyright 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-/* Values for GMP_LIMB_BITS etc will be determined by ./configure and put
+/* Values for BITS_PER_MP_LIMB etc will be determined by ./configure and put
in config.h. */
diff --git a/gmp/mpn/generic/hgcd.c b/gmp/mpn/generic/hgcd.c
index e27a9bdd82..5fc650bbd9 100644
--- a/gmp/mpn/generic/hgcd.c
+++ b/gmp/mpn/generic/hgcd.c
@@ -4,38 +4,497 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc.
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
+/* For input of size n, matrix elements are of size at most ceil(n/2)
+ - 1, but we need two limbs extra. Sets up M over the 4*s limbs at p, with s = (n+1)/2 + 1: the area is passed to MPN_ZERO, split into four s-limb elements, and the low limb of each diagonal element is set to 1. */
+void
+mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
+{
+ mp_size_t s = (n+1)/2 + 1; /* limbs reserved per matrix element */
+ M->alloc = s;
+ M->n = 1; /* current element size: one limb */
+ MPN_ZERO (p, 4 * s);
+ M->p[0][0] = p; /* the four elements are laid out back to back in p */
+ M->p[0][1] = p + s;
+ M->p[1][0] = p + 2 * s;
+ M->p[1][1] = p + 3 * s;
+
+ M->p[0][0][0] = M->p[1][1][0] = 1; /* diagonal low limbs = 1 */
+}
+
+/* Updated column COL, adding in column (1-COL). For each row the entry in column COL is overwritten with the sum of that row's two entries over M->n limbs; the carries go into limb M->n and M->n grows by one if either carry is non-zero. */
+static void
+hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col)
+{
+ mp_limb_t c0, c1; /* carry out of row 0 and row 1 */
+ ASSERT (col < 2);
+
+ c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n);
+ c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n);
+
+ M->p[0][col][M->n] = c0; /* carry becomes the next limb of the updated column */
+ M->p[1][col][M->n] = c1;
+
+ M->n += (c0 | c1) != 0; /* one limb larger only if some carry was produced */
+ ASSERT (M->n < M->alloc);
+}
+
+/* Updated column COL, adding in column Q * (1-COL). Temporary
+ * storage: qn + n <= M->alloc, where n is the size of the largest
+ * element in column 1 - COL. {qp,qn} is the quotient Q, tp is the scratch area for the products; M->n is updated to the new element size. */
+static void
+hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
+ unsigned col, mp_ptr tp)
+{
+ ASSERT (col < 2);
+
+ if (qn == 1)
+ {
+ mp_limb_t q = qp[0]; /* single-limb quotient: addmul in place, no scratch needed */
+ mp_limb_t c0, c1;
+
+ c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
+ c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);
+
+ M->p[0][col][M->n] = c0; /* carry limbs land just above the M->n limbs updated */
+ M->p[1][col][M->n] = c1;
+
+ M->n += (c0 | c1) != 0;
+ }
+ else
+ {
+ unsigned row;
+
+ /* Carries for the unlikely case that we get both high words
+ from the multiplication and carries from the addition. */
+ mp_limb_t c[2];
+ mp_size_t n;
+
+ /* The matrix will not necessarily grow in size by qn, so we
+ need normalization in order not to overflow M. */
+
+ for (n = M->n; n + qn > M->n; n--)
+ {
+ ASSERT (n > 0);
+ if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
+ break;
+ }
+
+ ASSERT (qn + n <= M->alloc);
+
+ for (row = 0; row < 2; row++)
+ {
+ if (qn <= n) /* mpn_mul wants the longer operand first */
+ mpn_mul (tp, M->p[row][1-col], n, qp, qn);
+ else
+ mpn_mul (tp, qp, qn, M->p[row][1-col], n);
+
+ ASSERT (n + qn >= M->n);
+ c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
+ }
+ if (c[0] | c[1])
+ {
+ M->n = n + qn + 1; /* sums occupy limbs 0 .. n+qn-1, the carry is limb n+qn */
+ M->p[0][col][n + qn] = c[0]; /* was [n-1], which overwrote a computed limb and left the top limb unset */
+ M->p[1][col][n + qn] = c[1];
+ }
+ else
+ {
+ n += qn;
+ n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0; /* drop the top limb if zero in both rows */
+ if (n > M->n)
+ M->n = n;
+ }
+ }
+
+ ASSERT (M->n < M->alloc);
+}
+
+/* Multiply M by M1 from the right. Since the M1 elements fit in
+ GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
+ temporary space M->n. Each row of M is handled by one mpn_hgcd_mul_matrix1_vector call, with the row's first element copied to tp beforehand. */
+static void
+hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
+ mp_ptr tp)
+{
+ mp_size_t n0, n1; /* sizes returned for row 0 and row 1 */
+
+ /* Could avoid copy by some swapping of pointers. */
+ MPN_COPY (tp, M->p[0][0], M->n);
+ n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
+ MPN_COPY (tp, M->p[1][0], M->n);
+ n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);
+
+ /* Depends on zero initialization */
+ M->n = MAX(n0, n1); /* new element size is the larger of the two row results */
+ ASSERT (M->n < M->alloc);
+}
+
+/* Perform a few steps, using some of mpn_hgcd2, subtraction and
+ division. Reduces the size by almost one limb or more, but never
+ below the given size s. Return new size for a and b, or 0 if no
+ more steps are possible.
+
+ If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
+ limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
+ fails, needs space for the quotient, qn <= n - s + 1 limbs, and for
+ hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
+ resulting size of M.
+
+ If N is the input size to the calling hgcd, then s = floor(N/2) +
+ 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
+ < N, so N is sufficient.
+*/
+
+static mp_size_t
+hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
+ struct hgcd_matrix *M, mp_ptr tp)
+{
+ struct hgcd_matrix1 M1;
+ mp_limb_t mask;
+ mp_limb_t ah, al, bh, bl;
+ mp_size_t an, bn, qn;
+ int col;
+
+ ASSERT (n > s);
+
+ mask = ap[n-1] | bp[n-1]; /* OR of the two top limbs */
+ ASSERT (mask > 0);
+
+ if (n == s + 1)
+ {
+ if (mask < 4)
+ goto subtract;
+
+ ah = ap[n-1]; al = ap[n-2];
+ bh = bp[n-1]; bl = bp[n-2];
+ }
+ else if (mask & GMP_NUMB_HIGHBIT)
+ {
+ ah = ap[n-1]; al = ap[n-2]; /* high bit already set: use the top two limbs as they are */
+ bh = bp[n-1]; bl = bp[n-2];
+ }
+ else
+ {
+ int shift;
+
+ count_leading_zeros (shift, mask); /* same shift applied to a and b; reads the top three limbs */
+ ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+ al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+ bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+ bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+ }
+
+ /* Try an mpn_hgcd2 step */
+ if (mpn_hgcd2 (ah, al, bh, bl, &M1))
+ {
+ /* Multiply M <- M * M1 */
+ hgcd_matrix_mul_1 (M, &M1, tp);
+
+ /* Can't swap inputs, so we need to copy. */
+ MPN_COPY (tp, ap, n);
+ /* Multiply M1^{-1} (a;b) */
+ return mpn_hgcd_mul_matrix1_inverse_vector (&M1, ap, tp, bp, n);
+ }
+
+ subtract:
+ /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and
+ bh was too small, or ah, bh were (almost) equal. Perform one
+ subtraction step (for possible cancellation of high limbs),
+ followed by one division. */
+
+ /* Since we must ensure that #(a-b) > s, we handle cancellation of
+ high limbs explicitly up front. (FIXME: Or is it better to just
+ subtract, normalize, and use an addition to undo if it turns out
+ the difference is too small?) */
+ for (an = n; an > s; an--)
+ if (ap[an-1] != bp[an-1])
+ break;
+
+ if (an == s) /* a and b agree in every limb above s: no step possible */
+ return 0;
+
+ /* Maintain a > b. When needed, swap a and b, and let col keep track
+ of how to update M. */
+ if (ap[an-1] > bp[an-1])
+ {
+ /* a is largest. In the subtraction step, we need to update
+ column 1 of M */
+ col = 1;
+ }
+ else
+ {
+ MP_PTR_SWAP (ap, bp);
+ col = 0;
+ }
+
+ bn = n;
+ MPN_NORMALIZE (bp, bn);
+ if (bn <= s)
+ return 0;
+
+ /* We have #a, #b > s. When is it possible that #(a-b) < s? For
+ cancellation to happen, the numbers must be of the form
+
+ a = x + 1, 0, ..., 0, al
+ b = x , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl
+
+ where al, bl denotes the least significant k limbs. If al < bl,
+ then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX,
+ then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */
+
+ if (ap[an-1] == bp[an-1] + 1)
+ {
+ mp_size_t k;
+ int c;
+ for (k = an-1; k > s; k--)
+ if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX)
+ break;
+
+ MPN_CMP (c, ap, bp, k); /* compare the low k limbs (al against bl above) */
+ if (c < 0)
+ {
+ mp_limb_t cy;
+
+ /* The limbs from k and up are cancelled. */
+ if (k == s)
+ return 0;
+ cy = mpn_sub_n (ap, ap, bp, k);
+ ASSERT (cy == 1);
+ an = k;
+ }
+ else
+ {
+ ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k));
+ ap[k] = 1; /* the difference has a single 1 limb above the low k limbs */
+ an = k + 1;
+ }
+ }
+ else
+ ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an));
+
+ ASSERT (an > s);
+ ASSERT (ap[an-1] > 0);
+ ASSERT (bn > s);
+ ASSERT (bp[bn-1] > 0);
+
+ hgcd_matrix_update_1 (M, col); /* record the subtraction step in M */
+
+ if (an < bn)
+ {
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ col ^= 1;
+ }
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (c < 0)
+ {
+ MP_PTR_SWAP (ap, bp);
+ col ^= 1;
+ }
+ }
+
+ /* Divide a / b. */
+ qn = an + 1 - bn;
+
+ /* FIXME: We could use an approximate division, that may return a
+ too small quotient, and only guarantee that the size of r is
+ almost the size of b. FIXME: Let ap and remainder overlap. */
+ mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); /* quotient to tp, remainder back into ap */
+ qn -= (tp[qn -1] == 0);
+
+ /* Normalize remainder */
+ an = bn;
+ for ( ; an > s; an--)
+ if (ap[an-1] > 0)
+ break;
+
+ if (an <= s)
+ {
+ /* Quotient is too large */
+ mp_limb_t cy;
+
+ cy = mpn_add (ap, bp, bn, ap, an); /* add b back to the remainder ... */
+
+ if (cy > 0)
+ {
+ ASSERT (bn < n);
+ ap[bn] = cy;
+ bp[bn] = 0;
+ bn++;
+ }
+
+ MPN_DECR_U (tp, qn, 1); /* ... and decrement the quotient by one to match */
+ qn -= (tp[qn-1] == 0);
+ }
+
+ if (qn > 0)
+ hgcd_matrix_update_q (M, tp, qn, col, tp + qn); /* scratch starts right after the quotient */
+
+ return bn;
+}
+
+/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
+ with elements of size at most (n+1)/2 - 1. Returns new size of a,
+ b, or zero if no reduction is possible. Works by calling hgcd_step with s = n/2 + 1 until it returns 0; tp is passed through to hgcd_step as scratch. */
+mp_size_t
+mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n,
+ struct hgcd_matrix *M, mp_ptr tp)
+{
+ mp_size_t s = n/2 + 1; /* size bound handed to every hgcd_step call */
+ mp_size_t nn;
+
+ ASSERT (n > s);
+ ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
+
+ nn = hgcd_step (n, ap, bp, s, M, tp);
+ if (!nn) /* the very first step failed: report no reduction */
+ return 0;
+
+ for (;;)
+ {
+ n = nn;
+ ASSERT (n > s);
+ nn = hgcd_step (n, ap, bp, s, M, tp);
+ if (!nn )
+ return n; /* size after the last successful step */
+ }
+}
+
+/* Multiply M by M1 from the right. Needs 4*(M->n + M1->n) + 5 limbs
+ of temporary storage (see mpn_matrix22_mul_itch). The product is formed by mpn_matrix22_mul on M's own elements, after which M->n is set to the normalized size. */
+void
+mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
+ mp_ptr tp)
+{
+ mp_size_t n;
+
+ /* About the new size of M:s elements. Since M1's diagonal elements
+ are > 0, no element can decrease. The new elements are of size
+ M->n + M1->n, one limb more or less. The computation of the
+ matrix product produces elements of size M->n + M1->n + 1. But
+ the true size, after normalization, may be three limbs smaller. */
+
+ /* FIXME: Strassen multiplication gives only a small speedup. In FFT
+ multiplication range, this function could be sped up quite a lot
+ using invariance. */
+ ASSERT (M->n + M1->n < M->alloc);
+
+ ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
+ | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);
+
+ ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
+ | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);
+
+ mpn_matrix22_mul (M->p[0][0], M->p[0][1],
+ M->p[1][0], M->p[1][1], M->n,
+ M1->p[0][0], M1->p[0][1],
+ M1->p[1][0], M1->p[1][1], M1->n, tp);
+
+ /* Index of last potentially non-zero limb, size is one greater. */
+ n = M->n + M1->n;
+
+ n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); /* up to three times: step down while limb n is zero in all four elements */
+ n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
+ n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
+
+ ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);
+
+ M->n = n + 1; /* n is an index, so the size is one more */
+}
+
+/* Multiplies the least significant p limbs of (a;b) by M^-1.
+ Temporary space needed: 2 * (p + M->n). Returns the resulting size: n + 1 if a high limb had to be appended, n - 1 if both top limbs ended up zero, otherwise n. */
+mp_size_t
+mpn_hgcd_matrix_adjust (struct hgcd_matrix *M,
+ mp_size_t n, mp_ptr ap, mp_ptr bp,
+ mp_size_t p, mp_ptr tp)
+{
+ /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b)
+ = (r11 a - r01 b; - r10 a + r00 b) */
+
+ mp_ptr t0 = tp; /* two product buffers of p + M->n limbs each */
+ mp_ptr t1 = tp + p + M->n;
+ mp_limb_t ah, bh; /* running high limb (limb n) of a and of b */
+ mp_limb_t cy;
+
+ ASSERT (p + M->n < n);
+
+ /* First compute the two values depending on a, before overwriting a */
+
+ if (M->n >= p) /* mpn_mul wants the longer operand first */
+ {
+ mpn_mul (t0, M->p[1][1], M->n, ap, p);
+ mpn_mul (t1, M->p[1][0], M->n, ap, p);
+ }
+ else
+ {
+ mpn_mul (t0, ap, p, M->p[1][1], M->n);
+ mpn_mul (t1, ap, p, M->p[1][0], M->n);
+ }
+
+ /* Update a */
+ MPN_COPY (ap, t0, p); /* low p limbs come straight from the product */
+ ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);
+
+ if (M->n >= p)
+ mpn_mul (t0, M->p[0][1], M->n, bp, p);
+ else
+ mpn_mul (t0, bp, p, M->p[0][1], M->n);
+
+ cy = mpn_sub (ap, ap, n, t0, p + M->n);
+ ASSERT (cy <= ah);
+ ah -= cy; /* borrow is taken out of the carry limb */
+
+ /* Update b */
+ if (M->n >= p)
+ mpn_mul (t0, M->p[0][0], M->n, bp, p);
+ else
+ mpn_mul (t0, bp, p, M->p[0][0], M->n);
+
+ MPN_COPY (bp, t0, p);
+ bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
+ cy = mpn_sub (bp, bp, n, t1, p + M->n); /* t1 still holds the product with a computed before a was overwritten */
+ ASSERT (cy <= bh);
+ bh -= cy;
+
+ if (ah > 0 || bh > 0)
+ {
+ ap[n] = ah; /* a high limb is left over: both numbers grow by one limb */
+ bp[n] = bh;
+ n++;
+ }
+ else
+ {
+ /* The subtraction can reduce the size by at most one limb. */
+ if (ap[n-1] == 0 && bp[n-1] == 0)
+ n--;
+ }
+ ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
+ return n;
+}
/* Size analysis for hgcd:
@@ -46,15 +505,16 @@ see https://www.gnu.org/licenses/. */
Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1)
= 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
- and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total,
- 4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12.
+ and for the hgcd_matrix_mul, we may need 4 ceil(n/2) + 1. In total,
+ 4 * ceil(n/4) + 4 ceil(n/2) + 5 <= 12 ceil(n/4) + 5.
For the recursive call, we need S(n1) = S(ceil(n/2)).
- S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2))
- <= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k))
- <= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k))
- <= 20 ceil(n/4) + 22k + S(ceil(n/2^k))
+ S(n) <= 12*ceil(n/4) + 5 + S(ceil(n/2))
+ <= 12*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 5k + S(ceil(n/2^k))
+ <= 12*(2 ceil(n/4) + k) + 5k + S(n/2^k)
+ <= 24 ceil(n/4) + 17k + S(n/2^k)
+
*/
mp_size_t
@@ -65,14 +525,15 @@ mpn_hgcd_itch (mp_size_t n)
mp_size_t nscaled;
if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
- return n;
+ return MPN_HGCD_LEHMER_ITCH (n);
/* Get the recursion depth. */
nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
count_leading_zeros (count, nscaled);
k = GMP_LIMB_BITS - count;
- return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD;
+ return 24 * ((n+3) / 4) + 17 * k
+ + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
}
/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
@@ -84,8 +545,9 @@ mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
struct hgcd_matrix *M, mp_ptr tp)
{
mp_size_t s = n/2 + 1;
+ mp_size_t n2 = (3*n)/4 + 1;
- mp_size_t nn;
+ mp_size_t p, nn;
int success = 0;
if (n <= s)
@@ -97,83 +559,65 @@ mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
ASSERT ((n+1)/2 - 1 < M->alloc);
- if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD))
- {
- mp_size_t n2 = (3*n)/4 + 1;
- mp_size_t p = n/2;
+ if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
+ return mpn_hgcd_lehmer (ap, bp, n, M, tp);
- nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp);
- if (nn)
- {
- n = nn;
- success = 1;
- }
+ p = n/2;
+ nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
+ if (nn > 0)
+ {
+ /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
+ = 2 (n - 1) */
+ n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
+ success = 1;
+ }
+ while (n > n2)
+ {
+ /* Needs n + 1 storage */
+ nn = hgcd_step (n, ap, bp, s, M, tp);
+ if (!nn)
+ return success ? n : 0;
+ n = nn;
+ success = 1;
+ }
- /* NOTE: It appears this loop never runs more than once (at
- least when not recursing to hgcd_appr). */
- while (n > n2)
- {
- /* Needs n + 1 storage */
- nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
- if (!nn)
- return success ? n : 0;
+ if (n > s + 2)
+ {
+ struct hgcd_matrix M1;
+ mp_size_t scratch;
- n = nn;
- success = 1;
- }
+ p = 2*s - n + 1;
+ scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
- if (n > s + 2)
+ mpn_hgcd_matrix_init(&M1, n - p, tp);
+ nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
+ if (nn > 0)
{
- struct hgcd_matrix M1;
- mp_size_t scratch;
-
- p = 2*s - n + 1;
- scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
-
- mpn_hgcd_matrix_init(&M1, n - p, tp);
-
- /* FIXME: Should use hgcd_reduce, but that may require more
- scratch space, which requires review. */
-
- nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
- if (nn > 0)
- {
- /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
- ASSERT (M->n + 2 >= M1.n);
-
- /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
- then either q or q + 1 is a correct quotient, and M1 will
- start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
- rules out the case that the size of M * M1 is much
- smaller than the expected M->n + M1->n. */
-
- ASSERT (M->n + M1.n < M->alloc);
-
- /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
- = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
- n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
-
- /* We need a bound for of M->n + M1.n. Let n be the original
- input size. Then
-
- ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
-
- and it follows that
-
- M.n + M1.n <= ceil(n/2) + 1
-
- Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
- amount of needed scratch space. */
- mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
- success = 1;
- }
+ /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
+ ASSERT (M->n + 2 >= M1.n);
+
+ /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
+ then either q or q + 1 is a correct quotient, and M1 will
+ start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
+ rules out the case that the size of M * M1 is much
+ smaller than the expected M->n + M1->n. */
+
+ ASSERT (M->n + M1.n < M->alloc);
+
+ /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
+ = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
+ n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
+ /* Needs 4 ceil(n/2) + 1 */
+ mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
+ success = 1;
}
}
+ /* This really is the base case */
for (;;)
{
/* Needs s+3 < n */
- nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
+ nn = hgcd_step (n, ap, bp, s, M, tp);
if (!nn)
return success ? n : 0;
diff --git a/gmp/mpn/generic/hgcd2.c b/gmp/mpn/generic/hgcd2.c
index 129637063f..ffc8c44f67 100644
--- a/gmp/mpn/generic/hgcd2.c
+++ b/gmp/mpn/generic/hgcd2.c
@@ -4,33 +4,23 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 1996, 1998, 2000-2004, 2008, 2012 Free Software Foundation, Inc.
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -209,7 +199,7 @@ div2 (mp_ptr rp,
/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
matrix M. Returns 1 if we make progress, i.e. can perform at least
- one subtraction. Otherwise returns zero. */
+ one subtraction. Otherwise returns zero.. */
/* FIXME: Possible optimizations:
@@ -348,6 +338,8 @@ mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
for (;;)
{
ASSERT (ah >= bh);
+ if (ah == bh)
+ break;
ah -= bh;
if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
@@ -377,6 +369,8 @@ mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
}
subtract_a1:
ASSERT (bh >= ah);
+ if (ah == bh)
+ break;
bh -= ah;
if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
@@ -445,3 +439,31 @@ mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
n += (ah | bh) > 0;
return n;
}
+
+/* Sets (r;b) = M^{-1}(a;b), with M^{-1} = (u11, -u01; -u10, u00) from
+ the left. Uses three buffers, to avoid a copy. */
+mp_size_t
+mpn_hgcd_mul_matrix1_inverse_vector (const struct hgcd_matrix1 *M,
+ mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
+{
+ mp_limb_t h0, h1;
+
+ /* Compute (r;b) <-- (u11 a - u01 b; -u10 a + u00 b) as
+
+ r = u11 * a
+ r -= u01 * b
+ b *= u00
+ b -= u10 * a
+ */
+
+ h0 = mpn_mul_1 (rp, ap, n, M->u[1][1]);
+ h1 = mpn_submul_1 (rp, bp, n, M->u[0][1]);
+ ASSERT (h0 == h1);
+
+ h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]);
+ h1 = mpn_submul_1 (bp, ap, n, M->u[1][0]);
+ ASSERT (h0 == h1);
+
+ n -= (rp[n-1] | bp[n-1]) == 0;
+ return n;
+}
diff --git a/gmp/mpn/generic/hgcd2_jacobi.c b/gmp/mpn/generic/hgcd2_jacobi.c
deleted file mode 100644
index e59c32a341..0000000000
--- a/gmp/mpn/generic/hgcd2_jacobi.c
+++ /dev/null
@@ -1,366 +0,0 @@
-/* hgcd2_jacobi.c
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 1996, 1998, 2000-2004, 2008, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#if GMP_NAIL_BITS > 0
-#error Nails not supported.
-#endif
-
-/* FIXME: Duplicated in hgcd2.c. Should move to gmp-impl.h, and
- possibly be renamed. */
-static inline mp_limb_t
-div1 (mp_ptr rp,
- mp_limb_t n0,
- mp_limb_t d0)
-{
- mp_limb_t q = 0;
-
- if ((mp_limb_signed_t) n0 < 0)
- {
- int cnt;
- for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
- {
- d0 = d0 << 1;
- }
-
- q = 0;
- while (cnt)
- {
- q <<= 1;
- if (n0 >= d0)
- {
- n0 = n0 - d0;
- q |= 1;
- }
- d0 = d0 >> 1;
- cnt--;
- }
- }
- else
- {
- int cnt;
- for (cnt = 0; n0 >= d0; cnt++)
- {
- d0 = d0 << 1;
- }
-
- q = 0;
- while (cnt)
- {
- d0 = d0 >> 1;
- q <<= 1;
- if (n0 >= d0)
- {
- n0 = n0 - d0;
- q |= 1;
- }
- cnt--;
- }
- }
- *rp = n0;
- return q;
-}
-
-/* Two-limb division optimized for small quotients. */
-static inline mp_limb_t
-div2 (mp_ptr rp,
- mp_limb_t nh, mp_limb_t nl,
- mp_limb_t dh, mp_limb_t dl)
-{
- mp_limb_t q = 0;
-
- if ((mp_limb_signed_t) nh < 0)
- {
- int cnt;
- for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
- {
- dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
- dl = dl << 1;
- }
-
- while (cnt)
- {
- q <<= 1;
- if (nh > dh || (nh == dh && nl >= dl))
- {
- sub_ddmmss (nh, nl, nh, nl, dh, dl);
- q |= 1;
- }
- dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
- dh = dh >> 1;
- cnt--;
- }
- }
- else
- {
- int cnt;
- for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
- {
- dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
- dl = dl << 1;
- }
-
- while (cnt)
- {
- dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
- dh = dh >> 1;
- q <<= 1;
- if (nh > dh || (nh == dh && nl >= dl))
- {
- sub_ddmmss (nh, nl, nh, nl, dh, dl);
- q |= 1;
- }
- cnt--;
- }
- }
-
- rp[0] = nl;
- rp[1] = nh;
-
- return q;
-}
-
-int
-mpn_hgcd2_jacobi (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
- struct hgcd_matrix1 *M, unsigned *bitsp)
-{
- mp_limb_t u00, u01, u10, u11;
- unsigned bits = *bitsp;
-
- if (ah < 2 || bh < 2)
- return 0;
-
- if (ah > bh || (ah == bh && al > bl))
- {
- sub_ddmmss (ah, al, ah, al, bh, bl);
- if (ah < 2)
- return 0;
-
- u00 = u01 = u11 = 1;
- u10 = 0;
- bits = mpn_jacobi_update (bits, 1, 1);
- }
- else
- {
- sub_ddmmss (bh, bl, bh, bl, ah, al);
- if (bh < 2)
- return 0;
-
- u00 = u10 = u11 = 1;
- u01 = 0;
- bits = mpn_jacobi_update (bits, 0, 1);
- }
-
- if (ah < bh)
- goto subtract_a;
-
- for (;;)
- {
- ASSERT (ah >= bh);
- if (ah == bh)
- goto done;
-
- if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
- {
- ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
- bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
-
- break;
- }
-
- /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
- 1), affecting the second column of M. */
- ASSERT (ah > bh);
- sub_ddmmss (ah, al, ah, al, bh, bl);
-
- if (ah < 2)
- goto done;
-
- if (ah <= bh)
- {
- /* Use q = 1 */
- u01 += u00;
- u11 += u10;
- bits = mpn_jacobi_update (bits, 1, 1);
- }
- else
- {
- mp_limb_t r[2];
- mp_limb_t q = div2 (r, ah, al, bh, bl);
- al = r[0]; ah = r[1];
- if (ah < 2)
- {
- /* A is too small, but q is correct. */
- u01 += q * u00;
- u11 += q * u10;
- bits = mpn_jacobi_update (bits, 1, q & 3);
- goto done;
- }
- q++;
- u01 += q * u00;
- u11 += q * u10;
- bits = mpn_jacobi_update (bits, 1, q & 3);
- }
- subtract_a:
- ASSERT (bh >= ah);
- if (ah == bh)
- goto done;
-
- if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
- {
- ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
- bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
-
- goto subtract_a1;
- }
-
- /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
- 1), affecting the first column of M. */
- sub_ddmmss (bh, bl, bh, bl, ah, al);
-
- if (bh < 2)
- goto done;
-
- if (bh <= ah)
- {
- /* Use q = 1 */
- u00 += u01;
- u10 += u11;
- bits = mpn_jacobi_update (bits, 0, 1);
- }
- else
- {
- mp_limb_t r[2];
- mp_limb_t q = div2 (r, bh, bl, ah, al);
- bl = r[0]; bh = r[1];
- if (bh < 2)
- {
- /* B is too small, but q is correct. */
- u00 += q * u01;
- u10 += q * u11;
- bits = mpn_jacobi_update (bits, 0, q & 3);
- goto done;
- }
- q++;
- u00 += q * u01;
- u10 += q * u11;
- bits = mpn_jacobi_update (bits, 0, q & 3);
- }
- }
-
- /* NOTE: Since we discard the least significant half limb, we don't
- get a truly maximal M (corresponding to |a - b| <
- 2^{GMP_LIMB_BITS +1}). */
- /* Single precision loop */
- for (;;)
- {
- ASSERT (ah >= bh);
- if (ah == bh)
- break;
-
- ah -= bh;
- if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
- break;
-
- if (ah <= bh)
- {
- /* Use q = 1 */
- u01 += u00;
- u11 += u10;
- bits = mpn_jacobi_update (bits, 1, 1);
- }
- else
- {
- mp_limb_t r;
- mp_limb_t q = div1 (&r, ah, bh);
- ah = r;
- if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
- {
- /* A is too small, but q is correct. */
- u01 += q * u00;
- u11 += q * u10;
- bits = mpn_jacobi_update (bits, 1, q & 3);
- break;
- }
- q++;
- u01 += q * u00;
- u11 += q * u10;
- bits = mpn_jacobi_update (bits, 1, q & 3);
- }
- subtract_a1:
- ASSERT (bh >= ah);
- if (ah == bh)
- break;
-
- bh -= ah;
- if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
- break;
-
- if (bh <= ah)
- {
- /* Use q = 1 */
- u00 += u01;
- u10 += u11;
- bits = mpn_jacobi_update (bits, 0, 1);
- }
- else
- {
- mp_limb_t r;
- mp_limb_t q = div1 (&r, bh, ah);
- bh = r;
- if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
- {
- /* B is too small, but q is correct. */
- u00 += q * u01;
- u10 += q * u11;
- bits = mpn_jacobi_update (bits, 0, q & 3);
- break;
- }
- q++;
- u00 += q * u01;
- u10 += q * u11;
- bits = mpn_jacobi_update (bits, 0, q & 3);
- }
- }
-
- done:
- M->u[0][0] = u00; M->u[0][1] = u01;
- M->u[1][0] = u10; M->u[1][1] = u11;
- *bitsp = bits;
-
- return 1;
-}
diff --git a/gmp/mpn/generic/hgcd_appr.c b/gmp/mpn/generic/hgcd_appr.c
deleted file mode 100644
index 660219372f..0000000000
--- a/gmp/mpn/generic/hgcd_appr.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/* hgcd_appr.c.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add
- HGCD_THRESHOLD at the end? */
-mp_size_t
-mpn_hgcd_appr_itch (mp_size_t n)
-{
- if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
- return n;
- else
- {
- unsigned k;
- int count;
- mp_size_t nscaled;
-
- /* Get the recursion depth. */
- nscaled = (n - 1) / (HGCD_APPR_THRESHOLD - 1);
- count_leading_zeros (count, nscaled);
- k = GMP_LIMB_BITS - count;
-
- return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD;
- }
-}
-
-/* Destroys inputs. */
-int
-mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
- struct hgcd_matrix *M, mp_ptr tp)
-{
- mp_size_t s;
- int success = 0;
-
- ASSERT (n > 0);
-
- ASSERT ((ap[n-1] | bp[n-1]) != 0);
-
- if (n <= 2)
- /* Implies s = n. A fairly uninteresting case but exercised by the
- random inputs of the testsuite. */
- return 0;
-
- ASSERT ((n+1)/2 - 1 < M->alloc);
-
- /* We aim for reduction of to GMP_NUMB_BITS * s bits. But each time
- we discard some of the least significant limbs, we must keep one
- additional bit to account for the truncation error. We maintain
- the GMP_NUMB_BITS * s - extra_bits as the current target size. */
-
- s = n/2 + 1;
- if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
- {
- unsigned extra_bits = 0;
-
- while (n > 2)
- {
- mp_size_t nn;
-
- ASSERT (n > s);
- ASSERT (n <= 2*s);
-
- nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
- if (!nn)
- break;
-
- n = nn;
- success = 1;
-
- /* We can truncate and discard the lower p bits whenever nbits <=
- 2*sbits - p. To account for the truncation error, we must
- adjust
-
- sbits <-- sbits + 1 - p,
-
- rather than just sbits <-- sbits - p. This adjustment makes
- the produced matrix slightly smaller than it could be. */
-
- if (GMP_NUMB_BITS * (n + 1) + 2 * extra_bits <= 2*GMP_NUMB_BITS * s)
- {
- mp_size_t p = (GMP_NUMB_BITS * (2*s - n) - 2*extra_bits) / GMP_NUMB_BITS;
-
- if (extra_bits == 0)
- {
- /* We cross a limb boundary and bump s. We can't do that
- if the result is that it makes makes min(U, V)
- smaller than 2^{GMP_NUMB_BITS} s. */
- if (s + 1 == n
- || mpn_zero_p (ap + s + 1, n - s - 1)
- || mpn_zero_p (bp + s + 1, n - s - 1))
- continue;
-
- extra_bits = GMP_NUMB_BITS - 1;
- s++;
- }
- else
- {
- extra_bits--;
- }
-
- /* Drop the p least significant limbs */
- ap += p; bp += p; n -= p; s -= p;
- }
- }
-
- ASSERT (s > 0);
-
- if (extra_bits > 0)
- {
- /* We can get here only of we have dropped at least one of the least
- significant bits, so we can decrement ap and bp. We can then shift
- left extra bits using mpn_rshift. */
- /* NOTE: In the unlikely case that n is large, it would be preferable
- to do an initial subdiv step to reduce the size before shifting,
- but that would mean duplicating mpn_gcd_subdiv_step with a bit
- count rather than a limb count. */
- ap--; bp--;
- ap[0] = mpn_rshift (ap+1, ap+1, n, GMP_NUMB_BITS - extra_bits);
- bp[0] = mpn_rshift (bp+1, bp+1, n, GMP_NUMB_BITS - extra_bits);
- n += (ap[n] | bp[n]) > 0;
-
- ASSERT (success);
-
- while (n > 2)
- {
- mp_size_t nn;
-
- ASSERT (n > s);
- ASSERT (n <= 2*s);
-
- nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-
- if (!nn)
- return 1;
-
- n = nn;
- }
- }
-
- if (n == 2)
- {
- struct hgcd_matrix1 M1;
- ASSERT (s == 1);
-
- if (mpn_hgcd2 (ap[1], ap[0], bp[1], bp[0], &M1))
- {
- /* Multiply M <- M * M1 */
- mpn_hgcd_matrix_mul_1 (M, &M1, tp);
- success = 1;
- }
- }
- return success;
- }
- else
- {
- mp_size_t n2 = (3*n)/4 + 1;
- mp_size_t p = n/2;
- mp_size_t nn;
-
- nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp);
- if (nn)
- {
- n = nn;
- /* FIXME: Discard some of the low limbs immediately? */
- success = 1;
- }
-
- while (n > n2)
- {
- mp_size_t nn;
-
- /* Needs n + 1 storage */
- nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
- if (!nn)
- return success;
-
- n = nn;
- success = 1;
- }
- if (n > s + 2)
- {
- struct hgcd_matrix M1;
- mp_size_t scratch;
-
- p = 2*s - n + 1;
- scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
-
- mpn_hgcd_matrix_init(&M1, n - p, tp);
- if (mpn_hgcd_appr (ap + p, bp + p, n - p, &M1, tp + scratch))
- {
- /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
- ASSERT (M->n + 2 >= M1.n);
-
- /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
- then either q or q + 1 is a correct quotient, and M1 will
- start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
- rules out the case that the size of M * M1 is much
- smaller than the expected M->n + M1->n. */
-
- ASSERT (M->n + M1.n < M->alloc);
-
- /* We need a bound for of M->n + M1.n. Let n be the original
- input size. Then
-
- ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
-
- and it follows that
-
- M.n + M1.n <= ceil(n/2) + 1
-
- Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
- amount of needed scratch space. */
- mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
- return 1;
- }
- }
-
- for(;;)
- {
- mp_size_t nn;
-
- ASSERT (n > s);
- ASSERT (n <= 2*s);
-
- nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-
- if (!nn)
- return success;
-
- n = nn;
- success = 1;
- }
- }
-}
diff --git a/gmp/mpn/generic/hgcd_jacobi.c b/gmp/mpn/generic/hgcd_jacobi.c
deleted file mode 100644
index 0a49e5b3a7..0000000000
--- a/gmp/mpn/generic/hgcd_jacobi.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/* hgcd_jacobi.c.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* This file is almost a copy of hgcd.c, with some added calls to
- mpn_jacobi_update */
-
-struct hgcd_jacobi_ctx
-{
- struct hgcd_matrix *M;
- unsigned *bitsp;
-};
-
-static void
-hgcd_jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn,
- mp_srcptr qp, mp_size_t qn, int d)
-{
- ASSERT (!gp);
- ASSERT (d >= 0);
-
- MPN_NORMALIZE (qp, qn);
- if (qn > 0)
- {
- struct hgcd_jacobi_ctx *ctx = (struct hgcd_jacobi_ctx *) p;
- /* NOTES: This is a bit ugly. A tp area is passed to
- gcd_subdiv_step, which stores q at the start of that area. We
- now use the rest. */
- mp_ptr tp = (mp_ptr) qp + qn;
-
- mpn_hgcd_matrix_update_q (ctx->M, qp, qn, d, tp);
- *ctx->bitsp = mpn_jacobi_update (*ctx->bitsp, d, qp[0] & 3);
- }
-}
-
-/* Perform a few steps, using some of mpn_hgcd2, subtraction and
- division. Reduces the size by almost one limb or more, but never
- below the given size s. Return new size for a and b, or 0 if no
- more steps are possible.
-
- If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
- limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
- fails, needs space for the quotient, qn <= n - s + 1 limbs, for and
- hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
- resulting size of M.
-
- If N is the input size to the calling hgcd, then s = floor(N/2) +
- 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
- < N, so N is sufficient.
-*/
-
-static mp_size_t
-hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
- struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp)
-{
- struct hgcd_matrix1 M1;
- mp_limb_t mask;
- mp_limb_t ah, al, bh, bl;
-
- ASSERT (n > s);
-
- mask = ap[n-1] | bp[n-1];
- ASSERT (mask > 0);
-
- if (n == s + 1)
- {
- if (mask < 4)
- goto subtract;
-
- ah = ap[n-1]; al = ap[n-2];
- bh = bp[n-1]; bl = bp[n-2];
- }
- else if (mask & GMP_NUMB_HIGHBIT)
- {
- ah = ap[n-1]; al = ap[n-2];
- bh = bp[n-1]; bl = bp[n-2];
- }
- else
- {
- int shift;
-
- count_leading_zeros (shift, mask);
- ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
- al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
- bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
- bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
- }
-
- /* Try an mpn_hgcd2 step */
- if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M1, bitsp))
- {
- /* Multiply M <- M * M1 */
- mpn_hgcd_matrix_mul_1 (M, &M1, tp);
-
- /* Can't swap inputs, so we need to copy. */
- MPN_COPY (tp, ap, n);
- /* Multiply M1^{-1} (a;b) */
- return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
- }
-
- subtract:
- {
- struct hgcd_jacobi_ctx ctx;
- ctx.M = M;
- ctx.bitsp = bitsp;
-
- return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp);
- }
-}
-
-/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
- with elements of size at most (n+1)/2 - 1. Returns new size of a,
- b, or zero if no reduction is possible. */
-
-/* Same scratch requirements as for mpn_hgcd. */
-mp_size_t
-mpn_hgcd_jacobi (mp_ptr ap, mp_ptr bp, mp_size_t n,
- struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp)
-{
- mp_size_t s = n/2 + 1;
-
- mp_size_t nn;
- int success = 0;
-
- if (n <= s)
- /* Happens when n <= 2, a fairly uninteresting case but exercised
- by the random inputs of the testsuite. */
- return 0;
-
- ASSERT ((ap[n-1] | bp[n-1]) > 0);
-
- ASSERT ((n+1)/2 - 1 < M->alloc);
-
- if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD))
- {
- mp_size_t n2 = (3*n)/4 + 1;
- mp_size_t p = n/2;
-
- nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, M, bitsp, tp);
- if (nn > 0)
- {
- /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
- = 2 (n - 1) */
- n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
- success = 1;
- }
- while (n > n2)
- {
- /* Needs n + 1 storage */
- nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp);
- if (!nn)
- return success ? n : 0;
- n = nn;
- success = 1;
- }
-
- if (n > s + 2)
- {
- struct hgcd_matrix M1;
- mp_size_t scratch;
-
- p = 2*s - n + 1;
- scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
-
- mpn_hgcd_matrix_init(&M1, n - p, tp);
- nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M1, bitsp, tp + scratch);
- if (nn > 0)
- {
- /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
- ASSERT (M->n + 2 >= M1.n);
-
- /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
- then either q or q + 1 is a correct quotient, and M1 will
- start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
- rules out the case that the size of M * M1 is much
- smaller than the expected M->n + M1->n. */
-
- ASSERT (M->n + M1.n < M->alloc);
-
- /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
- = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
- n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
-
- /* We need a bound for of M->n + M1.n. Let n be the original
- input size. Then
-
- ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
-
- and it follows that
-
- M.n + M1.n <= ceil(n/2) + 1
-
- Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
- amount of needed scratch space. */
- mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
- success = 1;
- }
- }
- }
-
- for (;;)
- {
- /* Needs s+3 < n */
- nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp);
- if (!nn)
- return success ? n : 0;
-
- n = nn;
- success = 1;
- }
-}
diff --git a/gmp/mpn/generic/hgcd_matrix.c b/gmp/mpn/generic/hgcd_matrix.c
deleted file mode 100644
index d9db331603..0000000000
--- a/gmp/mpn/generic/hgcd_matrix.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/* hgcd_matrix.c.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2003-2005, 2008, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* For input of size n, matrix elements are of size at most ceil(n/2)
- - 1, but we need two limbs extra. */
-void
-mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
-{
- mp_size_t s = (n+1)/2 + 1;
- M->alloc = s;
- M->n = 1;
- MPN_ZERO (p, 4 * s);
- M->p[0][0] = p;
- M->p[0][1] = p + s;
- M->p[1][0] = p + 2 * s;
- M->p[1][1] = p + 3 * s;
-
- M->p[0][0][0] = M->p[1][1][0] = 1;
-}
-
-/* Update column COL, adding in Q * column (1-COL). Temporary storage:
- * qn + n <= M->alloc, where n is the size of the largest element in
- * column 1 - COL. */
-void
-mpn_hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
- unsigned col, mp_ptr tp)
-{
- ASSERT (col < 2);
-
- if (qn == 1)
- {
- mp_limb_t q = qp[0];
- mp_limb_t c0, c1;
-
- c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
- c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);
-
- M->p[0][col][M->n] = c0;
- M->p[1][col][M->n] = c1;
-
- M->n += (c0 | c1) != 0;
- }
- else
- {
- unsigned row;
-
- /* Carries for the unlikely case that we get both high words
- from the multiplication and carries from the addition. */
- mp_limb_t c[2];
- mp_size_t n;
-
- /* The matrix will not necessarily grow in size by qn, so we
- need normalization in order not to overflow M. */
-
- for (n = M->n; n + qn > M->n; n--)
- {
- ASSERT (n > 0);
- if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
- break;
- }
-
- ASSERT (qn + n <= M->alloc);
-
- for (row = 0; row < 2; row++)
- {
- if (qn <= n)
- mpn_mul (tp, M->p[row][1-col], n, qp, qn);
- else
- mpn_mul (tp, qp, qn, M->p[row][1-col], n);
-
- ASSERT (n + qn >= M->n);
- c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
- }
-
- n += qn;
-
- if (c[0] | c[1])
- {
- M->p[0][col][n] = c[0];
- M->p[1][col][n] = c[1];
- n++;
- }
- else
- {
- n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
- ASSERT (n >= M->n);
- }
- M->n = n;
- }
-
- ASSERT (M->n < M->alloc);
-}
-
-/* Multiply M by M1 from the right. Since the M1 elements fit in
- GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
- temporary space M->n */
-void
-mpn_hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
- mp_ptr tp)
-{
- mp_size_t n0, n1;
-
- /* Could avoid copy by some swapping of pointers. */
- MPN_COPY (tp, M->p[0][0], M->n);
- n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
- MPN_COPY (tp, M->p[1][0], M->n);
- n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);
-
- /* Depends on zero initialization */
- M->n = MAX(n0, n1);
- ASSERT (M->n < M->alloc);
-}
-
-/* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs
- of temporary storage (see mpn_matrix22_mul_itch). */
-void
-mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
- mp_ptr tp)
-{
- mp_size_t n;
-
- /* About the new size of M:s elements. Since M1's diagonal elements
- are > 0, no element can decrease. The new elements are of size
- M->n + M1->n, one limb more or less. The computation of the
- matrix product produces elements of size M->n + M1->n + 1. But
- the true size, after normalization, may be three limbs smaller.
-
- The reason that the product has normalized size >= M->n + M1->n -
- 2 is subtle. It depends on the fact that M and M1 can be factored
- as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have
- M ending with a large power and M1 starting with a large power of
- the same matrix. */
-
- /* FIXME: Strassen multiplication gives only a small speedup. In FFT
- multiplication range, this function could be sped up quite a lot
- using invariance. */
- ASSERT (M->n + M1->n < M->alloc);
-
- ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
- | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);
-
- ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
- | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);
-
- mpn_matrix22_mul (M->p[0][0], M->p[0][1],
- M->p[1][0], M->p[1][1], M->n,
- M1->p[0][0], M1->p[0][1],
- M1->p[1][0], M1->p[1][1], M1->n, tp);
-
- /* Index of last potentially non-zero limb, size is one greater. */
- n = M->n + M1->n;
-
- n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
- n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
- n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
-
- ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);
-
- M->n = n + 1;
-}
-
-/* Multiplies the least significant p limbs of (a;b) by M^-1.
- Temporary space needed: 2 * (p + M->n)*/
-mp_size_t
-mpn_hgcd_matrix_adjust (const struct hgcd_matrix *M,
- mp_size_t n, mp_ptr ap, mp_ptr bp,
- mp_size_t p, mp_ptr tp)
-{
- /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b)
- = (r11 a - r01 b; - r10 a + r00 b */
-
- mp_ptr t0 = tp;
- mp_ptr t1 = tp + p + M->n;
- mp_limb_t ah, bh;
- mp_limb_t cy;
-
- ASSERT (p + M->n < n);
-
- /* First compute the two values depending on a, before overwriting a */
-
- if (M->n >= p)
- {
- mpn_mul (t0, M->p[1][1], M->n, ap, p);
- mpn_mul (t1, M->p[1][0], M->n, ap, p);
- }
- else
- {
- mpn_mul (t0, ap, p, M->p[1][1], M->n);
- mpn_mul (t1, ap, p, M->p[1][0], M->n);
- }
-
- /* Update a */
- MPN_COPY (ap, t0, p);
- ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);
-
- if (M->n >= p)
- mpn_mul (t0, M->p[0][1], M->n, bp, p);
- else
- mpn_mul (t0, bp, p, M->p[0][1], M->n);
-
- cy = mpn_sub (ap, ap, n, t0, p + M->n);
- ASSERT (cy <= ah);
- ah -= cy;
-
- /* Update b */
- if (M->n >= p)
- mpn_mul (t0, M->p[0][0], M->n, bp, p);
- else
- mpn_mul (t0, bp, p, M->p[0][0], M->n);
-
- MPN_COPY (bp, t0, p);
- bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
- cy = mpn_sub (bp, bp, n, t1, p + M->n);
- ASSERT (cy <= bh);
- bh -= cy;
-
- if (ah > 0 || bh > 0)
- {
- ap[n] = ah;
- bp[n] = bh;
- n++;
- }
- else
- {
- /* The subtraction can reduce the size by at most one limb. */
- if (ap[n-1] == 0 && bp[n-1] == 0)
- n--;
- }
- ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
- return n;
-}
diff --git a/gmp/mpn/generic/hgcd_reduce.c b/gmp/mpn/generic/hgcd_reduce.c
deleted file mode 100644
index 6f3d61ecea..0000000000
--- a/gmp/mpn/generic/hgcd_reduce.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/* hgcd_reduce.c.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Computes R -= A * B. Result must be non-negative. Normalized down
- to size an, and resulting size is returned. */
-static mp_size_t
-submul (mp_ptr rp, mp_size_t rn,
- mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
-{
- mp_ptr tp;
- TMP_DECL;
-
- ASSERT (bn > 0);
- ASSERT (an >= bn);
- ASSERT (rn >= an);
- ASSERT (an + bn <= rn + 1);
-
- TMP_MARK;
- tp = TMP_ALLOC_LIMBS (an + bn);
-
- mpn_mul (tp, ap, an, bp, bn);
- if (an + bn > rn)
- {
- ASSERT (tp[rn] == 0);
- bn--;
- }
- ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn));
- TMP_FREE;
-
- while (rn > an && (rp[rn-1] == 0))
- rn--;
-
- return rn;
-}
-
-/* Computes (a, b) <-- M^{-1} (a; b) */
-/* FIXME:
- x Take scratch parameter, and figure out scratch need.
-
- x Use some fallback for small M->n?
-*/
-static mp_size_t
-hgcd_matrix_apply (const struct hgcd_matrix *M,
- mp_ptr ap, mp_ptr bp,
- mp_size_t n)
-{
- mp_size_t an, bn, un, vn, nn;
- mp_size_t mn[2][2];
- mp_size_t modn;
- mp_ptr tp, sp, scratch;
- mp_limb_t cy;
- unsigned i, j;
-
- TMP_DECL;
-
- ASSERT ( (ap[n-1] | bp[n-1]) > 0);
-
- an = n;
- MPN_NORMALIZE (ap, an);
- bn = n;
- MPN_NORMALIZE (bp, bn);
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- {
- mp_size_t k;
- k = M->n;
- MPN_NORMALIZE (M->p[i][j], k);
- mn[i][j] = k;
- }
-
- ASSERT (mn[0][0] > 0);
- ASSERT (mn[1][1] > 0);
- ASSERT ( (mn[0][1] | mn[1][0]) > 0);
-
- TMP_MARK;
-
- if (mn[0][1] == 0)
- {
- /* A unchanged, M = (1, 0; q, 1) */
- ASSERT (mn[0][0] == 1);
- ASSERT (M->p[0][0][0] == 1);
- ASSERT (mn[1][1] == 1);
- ASSERT (M->p[1][1][0] == 1);
-
- /* Put B <-- B - q A */
- nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
- }
- else if (mn[1][0] == 0)
- {
- /* B unchanged, M = (1, q; 0, 1) */
- ASSERT (mn[0][0] == 1);
- ASSERT (M->p[0][0][0] == 1);
- ASSERT (mn[1][1] == 1);
- ASSERT (M->p[1][1][0] == 1);
-
- /* Put A <-- A - q * B */
- nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
- }
- else
- {
- /* A = m00 a + m01 b ==> a <= A / m00, b <= A / m01.
- B = m10 a + m11 b ==> a <= B / m10, b <= B / m11. */
- un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
- vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;
-
- nn = MAX (un, vn);
- /* In the range of interest, mulmod_bnm1 should always beat mullo. */
- modn = mpn_mulmod_bnm1_next_size (nn + 1);
-
- scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
- tp = TMP_ALLOC_LIMBS (modn);
- sp = TMP_ALLOC_LIMBS (modn);
-
- ASSERT (n <= 2*modn);
-
- if (n > modn)
- {
- cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
- MPN_INCR_U (ap, modn, cy);
-
- cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
- MPN_INCR_U (bp, modn, cy);
-
- n = modn;
- }
-
- mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
- mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);
-
- /* FIXME: Handle the small n case in some better way. */
- if (n + mn[1][1] < modn)
- MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
- if (n + mn[0][1] < modn)
- MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
-
- cy = mpn_sub_n (tp, tp, sp, modn);
- MPN_DECR_U (tp, modn, cy);
-
- ASSERT (mpn_zero_p (tp + nn, modn - nn));
-
- mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
- MPN_COPY (ap, tp, nn);
- mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);
-
- if (n + mn[1][0] < modn)
- MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
- if (n + mn[0][0] < modn)
- MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);
-
- cy = mpn_sub_n (tp, tp, sp, modn);
- MPN_DECR_U (tp, modn, cy);
-
- ASSERT (mpn_zero_p (tp + nn, modn - nn));
- MPN_COPY (bp, tp, nn);
-
- while ( (ap[nn-1] | bp[nn-1]) == 0)
- {
- nn--;
- ASSERT (nn > 0);
- }
- }
- TMP_FREE;
-
- return nn;
-}
-
-mp_size_t
-mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p)
-{
- mp_size_t itch;
- if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD))
- {
- itch = mpn_hgcd_itch (n-p);
-
- /* For arbitrary p, the storage for _adjust is 2*(p + M->n) = 2 *
- (p + ceil((n-p)/2) - 1 <= n + p - 1 */
- if (itch < n + p - 1)
- itch = n + p - 1;
- }
- else
- {
- itch = 2*(n-p) + mpn_hgcd_itch (n-p);
- /* Currently, hgcd_matrix_apply allocates its own storage. */
- }
- return itch;
-}
-
-/* FIXME: Document storage need. */
-mp_size_t
-mpn_hgcd_reduce (struct hgcd_matrix *M,
- mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p,
- mp_ptr tp)
-{
- mp_size_t nn;
- if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD))
- {
- nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
- if (nn > 0)
- /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
- = 2 (n - 1) */
- return mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
- }
- else
- {
- MPN_COPY (tp, ap + p, n - p);
- MPN_COPY (tp + n - p, bp + p, n - p);
- if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
- return hgcd_matrix_apply (M, ap, bp, n);
- }
- return 0;
-}
diff --git a/gmp/mpn/generic/hgcd_step.c b/gmp/mpn/generic/hgcd_step.c
deleted file mode 100644
index e58894ff3b..0000000000
--- a/gmp/mpn/generic/hgcd_step.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/* hgcd_step.c.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-
-static void
-hgcd_hook (void *p, mp_srcptr gp, mp_size_t gn,
- mp_srcptr qp, mp_size_t qn, int d)
-{
- ASSERT (!gp);
- ASSERT (d >= 0);
- ASSERT (d <= 1);
-
- MPN_NORMALIZE (qp, qn);
- if (qn > 0)
- {
- struct hgcd_matrix *M = (struct hgcd_matrix *) p;
- /* NOTES: This is a bit ugly. A tp area is passed to
- gcd_subdiv_step, which stores q at the start of that area. We
- now use the rest. */
- mp_ptr tp = (mp_ptr) qp + qn;
- mpn_hgcd_matrix_update_q (M, qp, qn, d, tp);
- }
-}
-
-/* Perform a few steps, using some of mpn_hgcd2, subtraction and
- division. Reduces the size by almost one limb or more, but never
- below the given size s. Return new size for a and b, or 0 if no
- more steps are possible.
-
- If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
- limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
- fails, needs space for the quotient, qn <= n - s limbs, for and
- hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
- (resulting size of M) + 1.
-
- If N is the input size to the calling hgcd, then s = floor(N/2) +
- 1, M->n < N, qn + product size <= n - s + n - s + 1 = 2 (n - s) + 1
- <= N.
-*/
-
-mp_size_t
-mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
- struct hgcd_matrix *M, mp_ptr tp)
-{
- struct hgcd_matrix1 M1;
- mp_limb_t mask;
- mp_limb_t ah, al, bh, bl;
-
- ASSERT (n > s);
-
- mask = ap[n-1] | bp[n-1];
- ASSERT (mask > 0);
-
- if (n == s + 1)
- {
- if (mask < 4)
- goto subtract;
-
- ah = ap[n-1]; al = ap[n-2];
- bh = bp[n-1]; bl = bp[n-2];
- }
- else if (mask & GMP_NUMB_HIGHBIT)
- {
- ah = ap[n-1]; al = ap[n-2];
- bh = bp[n-1]; bl = bp[n-2];
- }
- else
- {
- int shift;
-
- count_leading_zeros (shift, mask);
- ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
- al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
- bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
- bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
- }
-
- /* Try an mpn_hgcd2 step */
- if (mpn_hgcd2 (ah, al, bh, bl, &M1))
- {
- /* Multiply M <- M * M1 */
- mpn_hgcd_matrix_mul_1 (M, &M1, tp);
-
- /* Can't swap inputs, so we need to copy. */
- MPN_COPY (tp, ap, n);
- /* Multiply M1^{-1} (a;b) */
- return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
- }
-
- subtract:
-
- return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp);
-}
diff --git a/gmp/mpn/generic/invert.c b/gmp/mpn/generic/invert.c
index 4bc459d728..e40d3611e6 100644
--- a/gmp/mpn/generic/invert.c
+++ b/gmp/mpn/generic/invert.c
@@ -1,91 +1,60 @@
-/* invert.c -- Compute floor((B^{2n}-1)/U) - B^n.
+/* Compute {up,n}^(-1).
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright (C) 2007, 2009, 2010, 2012 Free Software Foundation, Inc.
+Copyright (C) 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+#include <stdlib.h>
#include "gmp.h"
#include "gmp-impl.h"
-#include "longlong.h"
+
+/* Formulas:
+ z = 2z-(zz)d
+ z = 2z-(zd)z
+ z = z(2-zd)
+ z = z-z*(zd-1)
+ z = z+z*(1-zd)
+*/
+
+mp_size_t
+mpn_invert_itch (mp_size_t n)
+{
+ return 3 * n + 2;
+}
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
- ASSERT (n > 0);
- ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
- ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
- ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
- ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
-
- if (n == 1)
- invert_limb (*ip, *dp);
- else {
- TMP_DECL;
-
- TMP_MARK;
- if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
- {
- /* Maximum scratch needed by this branch: 2*n */
- mp_size_t i;
- mp_ptr xp;
-
- xp = scratch; /* 2 * n limbs */
- for (i = n - 1; i >= 0; i--)
- xp[i] = GMP_NUMB_MAX;
- mpn_com (xp + n, dp, n);
- if (n == 2) {
- mpn_divrem_2 (ip, 0, xp, 4, dp);
- } else {
- gmp_pi1_t inv;
- invert_pi1 (inv, dp[n-1], dp[n-2]);
- /* FIXME: should we use dcpi1_div_q, for big sizes? */
- mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
- }
- }
- else { /* Use approximated inverse; correct the result if needed. */
- mp_limb_t e; /* The possible error in the approximate inverse */
+ mp_ptr np, rp;
+ mp_size_t i;
+ TMP_DECL;
+
+ TMP_MARK;
+ if (scratch == NULL)
+ {
+ scratch = TMP_ALLOC_LIMBS (mpn_invert_itch (n));
+ }
- ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
- e = mpn_ni_invertappr (ip, dp, n, scratch);
+ np = scratch; /* 2 * n limbs */
+ rp = scratch + 2 * n; /* n + 2 limbs */
+ for (i = n - 1; i >= 0; i--)
+ np[i] = ~CNST_LIMB(0);
+ mpn_com_n (np + n, dp, n);
+ mpn_tdiv_qr (rp, ip, 0L, np, 2 * n, dp, n);
+ MPN_COPY (ip, rp, n);
- if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
- /* Code to detect and correct the "off by one" approximation. */
- mpn_mul_n (scratch, ip, dp, n);
- ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n));
- if (! mpn_add (scratch, scratch, 2*n, dp, n))
- MPN_INCR_U (ip, n, 1); /* The value was wrong, correct it. */
- }
- }
- TMP_FREE;
- }
+ TMP_FREE;
}
diff --git a/gmp/mpn/generic/invertappr.c b/gmp/mpn/generic/invertappr.c
deleted file mode 100644
index 12326b8b75..0000000000
--- a/gmp/mpn/generic/invertappr.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/* mpn_invertappr and helper functions. Compute I such that
- floor((B^{2n}-1)/U - 1 <= I + B^n <= floor((B^{2n}-1)/U.
-
- Contributed to the GNU project by Marco Bodrato.
-
- The algorithm used here was inspired by ApproximateReciprocal from "Modern
- Computer Arithmetic", by Richard P. Brent and Paul Zimmermann. Special
- thanks to Paul Zimmermann for his very valuable suggestions on all the
- theoretical aspects during the work on this code.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright (C) 2007, 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/* FIXME: Remove NULL and TMP_*, as soon as all the callers properly
- allocate and pass the scratch to the function. */
-#include <stdlib.h> /* for NULL */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* FIXME: The iterative version splits the operand in two slightly unbalanced
- parts, the use of log_2 (or counting the bits) underestimate the maximum
- number of iterations. */
-
-#if TUNE_PROGRAM_BUILD
-#define NPOWS \
- ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
-#define MAYBE_dcpi1_divappr 1
-#else
-#define NPOWS \
- ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
-#define MAYBE_dcpi1_divappr \
- (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
-#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
- (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
-#undef INV_MULMOD_BNM1_THRESHOLD
-#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
-#endif
-#endif
-
-/* All the three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch), take
- the strictly normalised value {dp,n} (i.e., most significant bit must be set)
- as an input, and compute {ip,n}: the approximate reciprocal of {dp,n}.
-
- Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
- following conditions are satisfied by the output:
- 0 <= e <= 1;
- {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
- I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert.
- e=1 means that the result _may_ be one less than expected.
-
- The _bc version returns e=1 most of the time.
- The _ni version should return e=0 most of the time; only about 1% of
- possible random input should give e=1.
-
- When the strict result is needed, i.e., e=0 in the relation above:
- {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
- the function mpn_invert (ip, dp, n, scratch) should be used instead. */
-
-/* Maximum scratch needed by this branch (at tp): 3*n + 2 */
-static mp_limb_t
-mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr tp)
-{
- mp_ptr xp;
-
- ASSERT (n > 0);
- ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
- ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
- ASSERT (! MPN_OVERLAP_P (ip, n, tp, mpn_invertappr_itch(n)));
- ASSERT (! MPN_OVERLAP_P (dp, n, tp, mpn_invertappr_itch(n)));
-
- /* Compute a base value of r limbs. */
- if (n == 1)
- invert_limb (*ip, *dp);
- else {
- mp_size_t i;
- xp = tp + n + 2; /* 2 * n limbs */
-
- for (i = n - 1; i >= 0; i--)
- xp[i] = GMP_NUMB_MAX;
- mpn_com (xp + n, dp, n);
-
- /* Now xp contains B^2n - {dp,n}*B^n - 1 */
-
- /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
- if (n == 2) {
- mpn_divrem_2 (ip, 0, xp, 4, dp);
- } else {
- gmp_pi1_t inv;
- invert_pi1 (inv, dp[n-1], dp[n-2]);
- if (! MAYBE_dcpi1_divappr
- || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
- mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
- else
- mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
- MPN_DECR_U(ip, n, 1);
- return 1;
- }
- }
- return 0;
-}
-
-/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
- iterations (at least one).
-
- Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer
- Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121
- in version 0.4 of the book.
-
- Some adaptations were introduced, to allow product mod B^m-1 and return the
- value e.
-
- USE_MUL_N = 1 (default) introduces a correction in such a way that "the
- value of B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
- "2B^n-1"). This correction should not require to modify the proof.
-
- We use a wrapped product modulo B^m-1. NOTE: is there any normalisation
- problem for the [0] class? It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
- B^m-1. We may get [0] if and only if we get AX_h = B^{n+h}. This can
- happen only if A=B^{n}/2, but this implies X_h = B^{h}*2-1 i.e., AX_h =
- B^{n+h} - A, then we get into the "negative" branch, where X_h is not
- incremented (because A < B^n).
-
- FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it
- is allocated apart. */
-
-#define USE_MUL_N 1
-
-mp_limb_t
-mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
-{
- mp_limb_t cy;
- mp_ptr xp;
- mp_size_t rn, mn;
- mp_size_t sizes[NPOWS], *sizp;
- mp_ptr tp;
- TMP_DECL;
-#define rp scratch
-
- ASSERT (n > 2);
- ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
- ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
- ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
- ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
-
- /* Compute the computation precisions from highest to lowest, leaving the
- base case size in 'rn'. */
- sizp = sizes;
- rn = n;
- do {
- *sizp = rn;
- rn = ((rn) >> 1) + 1;
- sizp ++;
- } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
-
- /* We search the inverse of 0.{dp,n}, we compute it as 1.{ip,n} */
- dp += n;
- ip += n;
-
- /* Compute a base value of rn limbs. */
- mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);
-
- TMP_MARK;
-
- if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
- {
- mn = mpn_mulmod_bnm1_next_size (n + 1);
- tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
- }
- /* Use Newton's iterations to get the desired precision.*/
-
- /* define rp scratch; 2rn + 1 limbs <= 2(n>>1 + 1) + 1 <= n + 3 limbs */
- /* Maximum scratch needed by this branch <= 3*n + 2 */
- xp = scratch + n + 3; /* n + rn limbs */
- while (1) {
- mp_limb_t method;
-
- n = *--sizp;
- /*
- v n v
- +----+--+
- ^ rn ^
- */
-
- /* Compute i_jd . */
- if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
- || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
- /* FIXME: We do only need {xp,n+1}*/
- mpn_mul (xp, dp - n, n, ip - rn, rn);
- mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
- method = 1; /* Remember we used (truncated) product */
- /* We computed cy.{xp,rn+n} <- 1.{ip,rn} * 0.{dp,n} */
- } else { /* Use B^n-1 wraparound */
- mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
- /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
- /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
- /* Add dp*B^rn mod (B^mn-1) */
- ASSERT (n >= mn - rn);
- xp[mn] = 1 + mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
- cy = mpn_add_n (xp, xp, dp - (n - (mn - rn)), n - (mn - rn));
- MPN_INCR_U (xp + n - (mn - rn), mn + 1 - n + (mn - rn), cy);
- ASSERT (n + rn >= mn);
- /* Subtract B^{rn+n} */
- MPN_DECR_U (xp + rn + n - mn, 2*mn + 1 - rn - n, 1);
- if (xp[mn])
- MPN_INCR_U (xp, mn, xp[mn] - 1);
- else
- MPN_DECR_U (xp, mn, 1);
- method = 0; /* Remember we are working Mod B^m-1 */
- }
-
- if (xp[n] < 2) { /* "positive" residue class */
- cy = 1;
- while (xp[n] || mpn_cmp (xp, dp - n, n)>0) {
- xp[n] -= mpn_sub_n (xp, xp, dp - n, n);
- cy ++;
- }
- MPN_DECR_U(ip - rn, rn, cy);
- ASSERT (cy <= 4); /* at most 3 cycles for the while above */
- ASSERT_NOCARRY (mpn_sub_n (xp, dp - n, xp, n));
- ASSERT (xp[n] == 0);
- } else { /* "negative" residue class */
- mpn_com (xp, xp, n + 1);
- MPN_INCR_U(xp, n + 1, method);
- ASSERT (xp[n] <= 1);
-#if USE_MUL_N
- if (xp[n]) {
- MPN_INCR_U(ip - rn, rn, 1);
- ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
- }
-#endif
- }
-
- /* Compute x_ju_j. FIXME:We need {rp+rn,rn}, mulhi? */
-#if USE_MUL_N
- mpn_mul_n (rp, xp + n - rn, ip - rn, rn);
-#else
- rp[2*rn] = 0;
- mpn_mul (rp, xp + n - rn, rn + xp[n], ip - rn, rn);
-#endif
- /* We need _only_ the carry from the next addition */
- /* Anyway 2rn-n <= 2... we don't need to optimise. */
- cy = mpn_add_n (rp + rn, rp + rn, xp + n - rn, 2*rn - n);
- cy = mpn_add_nc (ip - n, rp + 3*rn - n, xp + rn, n - rn, cy);
- MPN_INCR_U (ip - rn, rn, cy + (1-USE_MUL_N)*(rp[2*rn] + xp[n]));
- if (sizp == sizes) { /* Get out of the cycle */
- /* Check for possible carry propagation from below. */
- cy = rp[3*rn - n - 1] > GMP_NUMB_MAX - 7; /* Be conservative. */
-/* cy = mpn_add_1 (rp + rn, rp + rn, 2*rn - n, 4); */
- break;
- }
- rn = n;
- }
- TMP_FREE;
-
- return cy;
-#undef rp
-}
-
-mp_limb_t
-mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
-{
- mp_limb_t res;
- TMP_DECL;
-
- TMP_MARK;
-
- if (scratch == NULL)
- scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
-
- ASSERT (n > 0);
- ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
- ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
- ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
- ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
-
- if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
- res = mpn_bc_invertappr (ip, dp, n, scratch);
- else
- res = mpn_ni_invertappr (ip, dp, n, scratch);
-
- TMP_FREE;
- return res;
-}
diff --git a/gmp/mpn/generic/jacbase.c b/gmp/mpn/generic/jacbase.c
index cd52bc9513..6972a130d9 100644
--- a/gmp/mpn/generic/jacbase.c
+++ b/gmp/mpn/generic/jacbase.c
@@ -3,33 +3,22 @@
THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO
INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP.
-Copyright 1999-2002, 2010 Free Software Foundation, Inc.
+Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -83,15 +72,15 @@ see https://www.gnu.org/licenses/. */
#define PROCESS_TWOS_EVEN \
{ \
int two, mask, shift; \
- \
+ \
two = JACOBI_TWO_U_BIT1 (b); \
mask = (~a & 2); \
a >>= 1; \
- \
+ \
shift = (~a & 1); \
a >>= shift; \
result_bit1 ^= two ^ (two & mask); \
- \
+ \
while ((a & 1) == 0) \
{ \
a >>= 1; \
@@ -102,14 +91,14 @@ see https://www.gnu.org/licenses/. */
#define PROCESS_TWOS_ANY \
{ \
int two, mask, shift; \
- \
+ \
two = JACOBI_TWO_U_BIT1 (b); \
shift = (~a & 1); \
a >>= shift; \
- \
+ \
mask = shift << 1; \
result_bit1 ^= (two & mask); \
- \
+ \
while ((a & 1) == 0) \
{ \
a >>= 1; \
@@ -119,9 +108,9 @@ see https://www.gnu.org/licenses/. */
}
#endif
-#if JACOBI_BASE_METHOD < 4
+
/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
- with a restricted range of inputs accepted, namely b>1, b odd.
+ with a restricted range of inputs accepted, namely b>1, b odd, and a<=b.
The initial result_bit1 is taken as a parameter for the convenience of
mpz_kronecker_ui() et al. The sign changes both here and in those
@@ -133,13 +122,17 @@ see https://www.gnu.org/licenses/. */
Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
possible, but a couple of tests suggest it's not a significant speedup,
- and may even be a slowdown, so what's here is good enough for now. */
+ and may even be a slowdown, so what's here is good enough for now.
+
+ Future: The code doesn't demand a<=b actually, so maybe this could be
+ relaxed. All the places this is used currently call with a<=b though. */
int
mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
{
ASSERT (b & 1); /* b odd */
ASSERT (b != 1);
+ ASSERT (a <= b);
if (a == 0)
return 0;
@@ -148,15 +141,11 @@ mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
if (a == 1)
goto done;
- if (a >= b)
- goto a_gt_b;
-
for (;;)
{
result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
MP_LIMB_T_SWAP (a, b);
- a_gt_b:
do
{
/* working on (a/b), a,b odd, a>=b */
@@ -177,67 +166,3 @@ mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
done:
return JACOBI_BIT1_TO_PN (result_bit1);
}
-#endif
-
-#if JACOBI_BASE_METHOD == 4
-/* Computes (a/b) for odd b > 1 and any a. The initial bit is taken as a
- * parameter. We have no need for the convention that the sign is in
- * bit 1, internally we use bit 0. */
-
-/* FIXME: Could try table-based count_trailing_zeros. */
-int
-mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit)
-{
- int c;
-
- ASSERT (b & 1);
- ASSERT (b > 1);
-
- if (a == 0)
- /* This is the only line which depends on b > 1 */
- return 0;
-
- bit >>= 1;
-
- /* Below, we represent a and b shifted right so that the least
- significant one bit is implicit. */
-
- b >>= 1;
-
- count_trailing_zeros (c, a);
- bit ^= c & (b ^ (b >> 1));
-
- /* We may have c==GMP_LIMB_BITS-1, so we can't use a>>c+1. */
- a >>= c;
- a >>= 1;
-
- do
- {
- mp_limb_t t = a - b;
- mp_limb_t bgta = LIMB_HIGHBIT_TO_MASK (t);
-
- if (t == 0)
- return 0;
-
- /* If b > a, invoke reciprocity */
- bit ^= (bgta & a & b);
-
- /* b <-- min (a, b) */
- b += (bgta & t);
-
- /* a <-- |a - b| */
- a = (t ^ bgta) - bgta;
-
- /* Number of trailing zeros is the same no matter if we look at
- * t or a, but using t gives more parallelism. */
- count_trailing_zeros (c, t);
- c ++;
- /* (2/b) = -1 if b = 3 or 5 mod 8 */
- bit ^= c & (b ^ (b >> 1));
- a >>= c;
- }
- while (b > 0);
-
- return 1-2*(bit & 1);
-}
-#endif /* JACOBI_BASE_METHOD == 4 */
diff --git a/gmp/mpn/generic/jacobi.c b/gmp/mpn/generic/jacobi.c
deleted file mode 100644
index bdc3ec67da..0000000000
--- a/gmp/mpn/generic/jacobi.c
+++ /dev/null
@@ -1,295 +0,0 @@
-/* jacobi.c
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 1996, 1998, 2000-2004, 2008, 2010, 2011 Free Software Foundation,
-Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#ifndef JACOBI_DC_THRESHOLD
-#define JACOBI_DC_THRESHOLD GCD_DC_THRESHOLD
-#endif
-
-/* Schönhage's rules:
- *
- * Assume r0 = r1 q1 + r2, with r0 odd, and r1 = q2 r2 + r3
- *
- * If r1 is odd, then
- *
- * (r1 | r0) = s(r1, r0) (r0 | r1) = s(r1, r0) (r2, r1)
- *
- * where s(x,y) = (-1)^{(x-1)(y-1)/4} = (-1)^[x = y = 3 (mod 4)].
- *
- * If r1 is even, r2 must be odd. We have
- *
- * (r1 | r0) = (r1 - r0 | r0) = (-1)^(r0-1)/2 (r0 - r1 | r0)
- * = (-1)^(r0-1)/2 s(r0, r0 - r1) (r0 | r0 - r1)
- * = (-1)^(r0-1)/2 s(r0, r0 - r1) (r1 | r0 - r1)
- *
- * Now, if r1 = 0 (mod 4), then the sign factor is +1, and repeating
- * q1 times gives
- *
- * (r1 | r0) = (r1 | r2) = (r3 | r2)
- *
- * On the other hand, if r1 = 2 (mod 4), the sign factor is
- * (-1)^{(r0-1)/2}, and repeating q1 times gives the exponent
- *
- * (r0-1)/2 + (r0-r1-1)/2 + ... + (r0 - (q1-1) r1)/2
- * = q1 (r0-1)/2 + q1 (q1-1)/2
- *
- * and we can summarize the even case as
- *
- * (r1 | r0) = t(r1, r0, q1) (r3 | r2)
- *
- * where t(x,y,q) = (-1)^{[x = 2 (mod 4)] (q(y-1)/2 + y(q-1)/2)}
- *
- * What about termination? The remainder sequence ends with (0|1) = 1
- * (or (0 | r) = 0 if r != 1). What are the possible cases? If r1 is
- * odd, r2 may be zero. If r1 is even, then r2 = r0 - q1 r1 is odd and
- * hence non-zero. We may have r3 = r1 - q2 r2 = 0.
- *
- * Examples: (11|15) = - (15|11) = - (4|11)
- * (4|11) = (4| 3) = (1| 3)
- * (1| 3) = (3|1) = (0|1) = 1
- *
- * (2|7) = (2|1) = (0|1) = 1
- *
- * Detail: (2|7) = (2-7|7) = (-1|7)(5|7) = -(7|5) = -(2|5)
- * (2|5) = (2-5|5) = (-1|5)(3|5) = (5|3) = (2|3)
- * (2|3) = (2-3|3) = (-1|3)(1|3) = -(3|1) = -(2|1)
- *
- */
-
-/* In principle, the state consists of four variables: e (one bit), a,
- b (two bits each), d (one bit). Collected factors are (-1)^e. a and
- b are the least significant bits of the current remainders. d
- (denominator) is 0 if we're currently subtracting multiplies of a
- from b, and 1 if we're subtracting b from a.
-
- e is stored in the least significant bit, while a, b and d are
- coded as only 13 distinct values in bits 1-4, according to the
- following table. For rows not mentioning d, the value is either
- implied, or it doesn't matter. */
-
-#if WANT_ASSERT
-static const struct
-{
- unsigned char a;
- unsigned char b;
-} decode_table[13] = {
- /* 0 */ { 0, 1 },
- /* 1 */ { 0, 3 },
- /* 2 */ { 1, 1 },
- /* 3 */ { 1, 3 },
- /* 4 */ { 2, 1 },
- /* 5 */ { 2, 3 },
- /* 6 */ { 3, 1 },
- /* 7 */ { 3, 3 }, /* d = 1 */
- /* 8 */ { 1, 0 },
- /* 9 */ { 1, 2 },
- /* 10 */ { 3, 0 },
- /* 11 */ { 3, 2 },
- /* 12 */ { 3, 3 }, /* d = 0 */
-};
-#define JACOBI_A(bits) (decode_table[(bits)>>1].a)
-#define JACOBI_B(bits) (decode_table[(bits)>>1].b)
-#endif /* WANT_ASSERT */
-
-const unsigned char jacobi_table[208] = {
-#include "jacobitab.h"
-};
-
-#define BITS_FAIL 31
-
-static void
-jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn,
- mp_srcptr qp, mp_size_t qn, int d)
-{
- unsigned *bitsp = (unsigned *) p;
-
- if (gp)
- {
- ASSERT (gn > 0);
- if (gn != 1 || gp[0] != 1)
- {
- *bitsp = BITS_FAIL;
- return;
- }
- }
-
- if (qp)
- {
- ASSERT (qn > 0);
- ASSERT (d >= 0);
- *bitsp = mpn_jacobi_update (*bitsp, d, qp[0] & 3);
- }
-}
-
-#define CHOOSE_P(n) (2*(n) / 3)
-
-int
-mpn_jacobi_n (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned bits)
-{
- mp_size_t scratch;
- mp_size_t matrix_scratch;
- mp_ptr tp;
-
- TMP_DECL;
-
- ASSERT (n > 0);
- ASSERT ( (ap[n-1] | bp[n-1]) > 0);
- ASSERT ( (bp[0] | ap[0]) & 1);
-
- /* FIXME: Check for small sizes first, before setting up temporary
- storage etc. */
- scratch = MPN_GCD_SUBDIV_STEP_ITCH(n);
-
- if (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD))
- {
- mp_size_t hgcd_scratch;
- mp_size_t update_scratch;
- mp_size_t p = CHOOSE_P (n);
- mp_size_t dc_scratch;
-
- matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
- hgcd_scratch = mpn_hgcd_itch (n - p);
- update_scratch = p + n - 1;
-
- dc_scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
- if (dc_scratch > scratch)
- scratch = dc_scratch;
- }
-
- TMP_MARK;
- tp = TMP_ALLOC_LIMBS(scratch);
-
- while (ABOVE_THRESHOLD (n, JACOBI_DC_THRESHOLD))
- {
- struct hgcd_matrix M;
- mp_size_t p = 2*n/3;
- mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
- mp_size_t nn;
- mpn_hgcd_matrix_init (&M, n - p, tp);
-
- nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M, &bits,
- tp + matrix_scratch);
- if (nn > 0)
- {
- ASSERT (M.n <= (n - p - 1)/2);
- ASSERT (M.n + p <= (p + n - 1) / 2);
- /* Temporary storage 2 (p + M->n) <= p + n - 1. */
- n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
- }
- else
- {
- /* Temporary storage n */
- n = mpn_gcd_subdiv_step (ap, bp, n, 0, jacobi_hook, &bits, tp);
- if (!n)
- {
- TMP_FREE;
- return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits);
- }
- }
- }
-
- while (n > 2)
- {
- struct hgcd_matrix1 M;
- mp_limb_t ah, al, bh, bl;
- mp_limb_t mask;
-
- mask = ap[n-1] | bp[n-1];
- ASSERT (mask > 0);
-
- if (mask & GMP_NUMB_HIGHBIT)
- {
- ah = ap[n-1]; al = ap[n-2];
- bh = bp[n-1]; bl = bp[n-2];
- }
- else
- {
- int shift;
-
- count_leading_zeros (shift, mask);
- ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
- al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
- bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
- bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
- }
-
- /* Try an mpn_nhgcd2 step */
- if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M, &bits))
- {
- n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n);
- MP_PTR_SWAP (ap, tp);
- }
- else
- {
- /* mpn_hgcd2 has failed. Then either one of a or b is very
- small, or the difference is very small. Perform one
- subtraction followed by one division. */
- n = mpn_gcd_subdiv_step (ap, bp, n, 0, &jacobi_hook, &bits, tp);
- if (!n)
- {
- TMP_FREE;
- return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits);
- }
- }
- }
-
- if (bits >= 16)
- MP_PTR_SWAP (ap, bp);
-
- ASSERT (bp[0] & 1);
-
- if (n == 1)
- {
- mp_limb_t al, bl;
- al = ap[0];
- bl = bp[0];
-
- TMP_FREE;
- if (bl == 1)
- return 1 - 2*(bits & 1);
- else
- return mpn_jacobi_base (al, bl, bits << 1);
- }
-
- else
- {
- int res = mpn_jacobi_2 (ap, bp, bits & 1);
- TMP_FREE;
- return res;
- }
-}
diff --git a/gmp/mpn/generic/jacobi_2.c b/gmp/mpn/generic/jacobi_2.c
deleted file mode 100644
index 9f480f7834..0000000000
--- a/gmp/mpn/generic/jacobi_2.c
+++ /dev/null
@@ -1,352 +0,0 @@
-/* jacobi_2.c
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 1996, 1998, 2000-2004, 2008, 2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#ifndef JACOBI_2_METHOD
-#define JACOBI_2_METHOD 2
-#endif
-
-/* Computes (a / b) where b is odd, and a and b are otherwise arbitrary
- two-limb numbers. */
-#if JACOBI_2_METHOD == 1
-int
-mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit)
-{
- mp_limb_t ah, al, bh, bl;
- int c;
-
- al = ap[0];
- ah = ap[1];
- bl = bp[0];
- bh = bp[1];
-
- ASSERT (bl & 1);
-
- bl = ((bh << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK) | (bl >> 1);
- bh >>= 1;
-
- if ( (bh | bl) == 0)
- return 1 - 2*(bit & 1);
-
- if ( (ah | al) == 0)
- return 0;
-
- if (al == 0)
- {
- al = ah;
- ah = 0;
- bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1));
- }
- count_trailing_zeros (c, al);
- bit ^= c & (bl ^ (bl >> 1));
-
- c++;
- if (UNLIKELY (c == GMP_NUMB_BITS))
- {
- al = ah;
- ah = 0;
- }
- else
- {
- al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
- ah >>= c;
- }
- while ( (ah | bh) > 0)
- {
- mp_limb_t th, tl;
- mp_limb_t bgta;
-
- sub_ddmmss (th, tl, ah, al, bh, bl);
- if ( (tl | th) == 0)
- return 0;
-
- bgta = LIMB_HIGHBIT_TO_MASK (th);
-
- /* If b > a, invoke reciprocity */
- bit ^= (bgta & al & bl);
-
- /* b <-- min (a, b) */
- add_ssaaaa (bh, bl, bh, bl, th & bgta, tl & bgta);
-
- if ( (bh | bl) == 0)
- return 1 - 2*(bit & 1);
-
- /* a <-- |a - b| */
- al = (bgta ^ tl) - bgta;
- ah = (bgta ^ th);
-
- if (UNLIKELY (al == 0))
- {
- /* If b > a, al == 0 implies that we have a carry to
- propagate. */
- al = ah - bgta;
- ah = 0;
- bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1));
- }
- count_trailing_zeros (c, al);
- c++;
- bit ^= c & (bl ^ (bl >> 1));
-
- if (UNLIKELY (c == GMP_NUMB_BITS))
- {
- al = ah;
- ah = 0;
- }
- else
- {
- al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
- ah >>= c;
- }
- }
-
- ASSERT (bl > 0);
-
- while ( (al | bl) & GMP_LIMB_HIGHBIT)
- {
- /* Need an extra comparison to get the mask. */
- mp_limb_t t = al - bl;
- mp_limb_t bgta = - (bl > al);
-
- if (t == 0)
- return 0;
-
- /* If b > a, invoke reciprocity */
- bit ^= (bgta & al & bl);
-
- /* b <-- min (a, b) */
- bl += (bgta & t);
-
- /* a <-- |a - b| */
- al = (t ^ bgta) - bgta;
-
- /* Number of trailing zeros is the same no matter if we look at
- * t or a, but using t gives more parallelism. */
- count_trailing_zeros (c, t);
- c ++;
- /* (2/b) = -1 if b = 3 or 5 mod 8 */
- bit ^= c & (bl ^ (bl >> 1));
-
- if (UNLIKELY (c == GMP_NUMB_BITS))
- return 1 - 2*(bit & 1);
-
- al >>= c;
- }
-
- /* Here we have a little impedance mismatch. Better to inline it? */
- return mpn_jacobi_base (2*al+1, 2*bl+1, bit << 1);
-}
-#elif JACOBI_2_METHOD == 2
-int
-mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit)
-{
- mp_limb_t ah, al, bh, bl;
- int c;
-
- al = ap[0];
- ah = ap[1];
- bl = bp[0];
- bh = bp[1];
-
- ASSERT (bl & 1);
-
- /* Use bit 1. */
- bit <<= 1;
-
- if (bh == 0 && bl == 1)
- /* (a|1) = 1 */
- return 1 - (bit & 2);
-
- if (al == 0)
- {
- if (ah == 0)
- /* (0|b) = 0, b > 1 */
- return 0;
-
- count_trailing_zeros (c, ah);
- bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
-
- al = bl;
- bl = ah >> c;
-
- if (bl == 1)
- /* (1|b) = 1 */
- return 1 - (bit & 2);
-
- ah = bh;
-
- bit ^= al & bl;
-
- goto b_reduced;
- }
- if ( (al & 1) == 0)
- {
- count_trailing_zeros (c, al);
-
- al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
- ah >>= c;
- bit ^= (c << 1) & (bl ^ (bl >> 1));
- }
- if (ah == 0)
- {
- if (bh > 0)
- {
- bit ^= al & bl;
- MP_LIMB_T_SWAP (al, bl);
- ah = bh;
- goto b_reduced;
- }
- goto ab_reduced;
- }
-
- while (bh > 0)
- {
- /* Compute (a|b) */
- while (ah > bh)
- {
- sub_ddmmss (ah, al, ah, al, bh, bl);
- if (al == 0)
- {
- count_trailing_zeros (c, ah);
- bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
-
- al = bl;
- bl = ah >> c;
- ah = bh;
-
- bit ^= al & bl;
- goto b_reduced;
- }
- count_trailing_zeros (c, al);
- bit ^= (c << 1) & (bl ^ (bl >> 1));
- al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
- ah >>= c;
- }
- if (ah == bh)
- goto cancel_hi;
-
- if (ah == 0)
- {
- bit ^= al & bl;
- MP_LIMB_T_SWAP (al, bl);
- ah = bh;
- break;
- }
-
- bit ^= al & bl;
-
- /* Compute (b|a) */
- while (bh > ah)
- {
- sub_ddmmss (bh, bl, bh, bl, ah, al);
- if (bl == 0)
- {
- count_trailing_zeros (c, bh);
- bit ^= ((GMP_NUMB_BITS + c) << 1) & (al ^ (al >> 1));
-
- bl = bh >> c;
- bit ^= al & bl;
- goto b_reduced;
- }
- count_trailing_zeros (c, bl);
- bit ^= (c << 1) & (al ^ (al >> 1));
- bl = ((bh << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (bl >> c);
- bh >>= c;
- }
- bit ^= al & bl;
-
- /* Compute (a|b) */
- if (ah == bh)
- {
- cancel_hi:
- if (al < bl)
- {
- MP_LIMB_T_SWAP (al, bl);
- bit ^= al & bl;
- }
- al -= bl;
- if (al == 0)
- return 0;
-
- count_trailing_zeros (c, al);
- bit ^= (c << 1) & (bl ^ (bl >> 1));
- al >>= c;
-
- if (al == 1)
- return 1 - (bit & 2);
-
- MP_LIMB_T_SWAP (al, bl);
- bit ^= al & bl;
- break;
- }
- }
-
- b_reduced:
- /* Compute (a|b), with b a single limb. */
- ASSERT (bl & 1);
-
- if (bl == 1)
- /* (a|1) = 1 */
- return 1 - (bit & 2);
-
- while (ah > 0)
- {
- ah -= (al < bl);
- al -= bl;
- if (al == 0)
- {
- if (ah == 0)
- return 0;
- count_trailing_zeros (c, ah);
- bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
- al = ah >> c;
- goto ab_reduced;
- }
- count_trailing_zeros (c, al);
-
- al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
- ah >>= c;
- bit ^= (c << 1) & (bl ^ (bl >> 1));
- }
- ab_reduced:
- ASSERT (bl & 1);
- ASSERT (bl > 1);
-
- return mpn_jacobi_base (al, bl, bit);
-}
-#else
-#error Unsupported value for JACOBI_2_METHOD
-#endif
diff --git a/gmp/mpn/generic/logops_n.c b/gmp/mpn/generic/logops_n.c
deleted file mode 100644
index 1b534ff4ba..0000000000
--- a/gmp/mpn/generic/logops_n.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/* mpn_and_n, mpn_ior_n, etc -- mpn logical operations.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#ifdef OPERATION_and_n
-#define func __MPN(and_n)
-#define call mpn_and_n
-#endif
-
-#ifdef OPERATION_andn_n
-#define func __MPN(andn_n)
-#define call mpn_andn_n
-#endif
-
-#ifdef OPERATION_nand_n
-#define func __MPN(nand_n)
-#define call mpn_nand_n
-#endif
-
-#ifdef OPERATION_ior_n
-#define func __MPN(ior_n)
-#define call mpn_ior_n
-#endif
-
-#ifdef OPERATION_iorn_n
-#define func __MPN(iorn_n)
-#define call mpn_iorn_n
-#endif
-
-#ifdef OPERATION_nior_n
-#define func __MPN(nior_n)
-#define call mpn_nior_n
-#endif
-
-#ifdef OPERATION_xor_n
-#define func __MPN(xor_n)
-#define call mpn_xor_n
-#endif
-
-#ifdef OPERATION_xnor_n
-#define func __MPN(xnor_n)
-#define call mpn_xnor_n
-#endif
-
-void
-func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
-{
- call (rp, up, vp, n);
-}
diff --git a/gmp/mpn/generic/lshift.c b/gmp/mpn/generic/lshift.c
index 5182632976..fdc7e4423e 100644
--- a/gmp/mpn/generic/lshift.c
+++ b/gmp/mpn/generic/lshift.c
@@ -1,32 +1,22 @@
/* mpn_lshift -- Shift left low level.
-Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1996, 2000, 2001, 2002 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/lshiftc.c b/gmp/mpn/generic/lshiftc.c
deleted file mode 100644
index e8051b7b93..0000000000
--- a/gmp/mpn/generic/lshiftc.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/* mpn_lshiftc -- Shift left low level with complement.
-
-Copyright 1991, 1993, 1994, 1996, 2000-2002, 2009 Free Software Foundation,
-Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Shift U (pointed to by up and n limbs long) cnt bits to the left
- and store the n least significant limbs of the result at rp.
- Return the bits shifted out from the most significant limb.
-
- Argument constraints:
- 1. 0 < cnt < GMP_NUMB_BITS.
- 2. If the result is to be written over the input, rp must be >= up.
-*/
-
-mp_limb_t
-mpn_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
-{
- mp_limb_t high_limb, low_limb;
- unsigned int tnc;
- mp_size_t i;
- mp_limb_t retval;
-
- ASSERT (n >= 1);
- ASSERT (cnt >= 1);
- ASSERT (cnt < GMP_NUMB_BITS);
- ASSERT (MPN_SAME_OR_DECR_P (rp, up, n));
-
- up += n;
- rp += n;
-
- tnc = GMP_NUMB_BITS - cnt;
- low_limb = *--up;
- retval = low_limb >> tnc;
- high_limb = (low_limb << cnt);
-
- for (i = n - 1; i != 0; i--)
- {
- low_limb = *--up;
- *--rp = (~(high_limb | (low_limb >> tnc))) & GMP_NUMB_MASK;
- high_limb = low_limb << cnt;
- }
- *--rp = (~high_limb) & GMP_NUMB_MASK;
-
- return retval;
-}
diff --git a/gmp/mpn/generic/matrix22_mul.c b/gmp/mpn/generic/matrix22_mul.c
index 59531eb1b2..f979385d9d 100644
--- a/gmp/mpn/generic/matrix22_mul.c
+++ b/gmp/mpn/generic/matrix22_mul.c
@@ -1,38 +1,25 @@
/* matrix22_mul.c.
- Contributed by Niels Möller and Marco Bodrato.
-
THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -83,198 +70,143 @@ mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
|| BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
return 3*rn + 2*mn;
else
- return 3*(rn + mn) + 5;
+ return 4*(rn + mn) + 5;
}
/* Algorithm:
/ s0 \ / 1 0 0 0 \ / r0 \
- | s1 | | 0 1 0 1 | | r1 |
- | s2 | | 0 0 -1 1 | | r2 |
- | s3 | = | 0 1 -1 1 | \ r3 /
- | s4 | | -1 1 -1 1 |
- | s5 | | 0 1 0 0 |
- \ s6 / \ 0 0 1 0 /
+ | s1 | | 0 1 0 0 | | r1 |
+ | s2 | | 0 0 1 1 | | r2 |
+ | s3 | = | -1 0 1 1 | \ r3 /
+ | s4 | | 1 0 -1 0 |
+ | s5 | | 1 1 -1 -1 |
+ \ s6 / \ 0 0 0 1 /
/ t0 \ / 1 0 0 0 \ / m0 \
- | t1 | | 0 1 0 1 | | m1 |
- | t2 | | 0 0 -1 1 | | m2 |
- | t3 | = | 0 1 -1 1 | \ m3 /
- | t4 | | -1 1 -1 1 |
- | t5 | | 0 1 0 0 |
- \ t6 / \ 0 0 1 0 /
-
- Note: the two matrices above are the same, but s_i and t_i are used
- in the same product, only for i<4, see "A Strassen-like Matrix
- Multiplication suited for squaring and higher power computation" by
- M. Bodrato, in Proceedings of ISSAC 2010.
-
- / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \
- | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 |
- | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 |
- \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 |
- | s4*t5 |
- | s5*t6 |
- \ s6*t4 /
-
- The scheduling uses two temporaries U0 and U1 to store products, and
- two, S0 and T0, to store combinations of entries of the two
- operands.
+ | t1 | | 0 0 1 0 | | m1 |
+ | t2 | | -1 1 0 0 | | m2 |
+ | t3 | = | 1 -1 0 1 | \ m3 /
+ | t4 | | 0 -1 0 1 |
+ | t5 | | 0 0 0 1 |
+ \ t6 / \ -1 1 1 -1 /
+
+ / r0 \ / 1 1 0 0 0 0 0 \ / s0 * t0 \
+ | r1 | = | 1 0 1 1 0 1 0 | | s1 * t1 |
+ | r2 | | 1 0 0 1 1 0 1 | | s2 * t2 |
+ \ r3 / \ 1 0 1 1 1 0 0 / | s3 * t3 |
+ | s4 * t4 |
+ | s5 * t5 |
+ \ s6 * t6 /
*/
/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
*
* Resulting elements are of size up to rn + mn + 1.
*
- * Temporary storage: 3 rn + 3 mn + 5. */
+ * Temporary storage: 4 rn + 4 mn + 5. */
void
mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
mp_ptr tp)
{
- mp_ptr s0, t0, u0, u1;
- int r1s, r3s, s0s, t0s, u1s;
- s0 = tp; tp += rn + 1;
- t0 = tp; tp += mn + 1;
+ mp_ptr s2, s3, t2, t3, u0, u1;
+ int r2s, r3s, s3s, t2s, t3s, u0s, u1s;
+ s2 = tp; tp += rn;
+ s3 = tp; tp += rn + 1;
+ t2 = tp; tp += mn;
+ t3 = tp; tp += mn + 1;
u0 = tp; tp += rn + mn + 1;
u1 = tp; /* rn + mn + 2 */
- MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */
- r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */
- if (r3s)
- {
- r1s = abs_sub_n (r1, r1, r3, rn);
- r1[rn] = 0;
- }
- else
- {
- r1[rn] = mpn_add_n (r1, r1, r3, rn);
- r1s = 0; /* r1 - r2 + r3 */
- }
- if (r1s)
- {
- s0[rn] = mpn_add_n (s0, r1, r0, rn);
- s0s = 0;
- }
- else if (r1[rn] != 0)
- {
- s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
- s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */
- /* Reverse sign! */
- }
- else
- {
- s0s = abs_sub_n (s0, r0, r1, rn);
- s0[rn] = 0;
- }
- MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */
- r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
- ASSERT (r0[rn+mn] < 2); /* u0 + u5 */
+ MUL (u0, r0, rn, m0, mn); /* 0 */
+ MUL (u1, r1, rn, m2, mn); /* 1 */
- t0s = abs_sub_n (t0, m3, m2, mn);
- u1s = r3s^t0s^1; /* Reverse sign! */
- MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */
- u1[rn+mn] = 0;
- if (t0s)
- {
- t0s = abs_sub_n (t0, m1, t0, mn);
- t0[mn] = 0;
- }
- else
- {
- t0[mn] = mpn_add_n (t0, t0, m1, mn);
- }
+ MPN_COPY (s2, r3, rn);
- /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
- at r3. I'd expect that for matrices of random size, the high
- words t0[mn] and r1[rn] are non-zero with a pretty small
- probability. If that can be confirmed this should be done as an
- unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
- add_n. */
- if (t0[mn] != 0)
+ r3[rn] = mpn_add_n (r3, r3, r2, rn);
+ r0[rn] = 0;
+ s3s = abs_sub_n (s3, r3, r0, rn + 1);
+ t2s = abs_sub_n (t2, m1, m0, mn);
+ if (t2s)
{
- MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */
- ASSERT (r1[rn] < 2);
- if (r1[rn] != 0)
- mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
+ t3[mn] = mpn_add_n (t3, m3, t2, mn);
+ t3s = 0;
}
else
{
- MUL (r3, r1, rn + 1, t0, mn);
+ t3s = abs_sub_n (t3, m3, t2, mn);
+ t3[mn] = 0;
}
- ASSERT (r3[rn+mn] < 4);
+ r2s = abs_sub_n (r2, r0, r2, rn);
+ r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
- u0[rn+mn] = 0;
- if (r1s^t0s)
+ MUL(u1, s3, rn+1, t3, mn+1); /* 3 */
+ u1s = s3s ^ t3s;
+ ASSERT (u1[rn+mn+1] == 0);
+ ASSERT (u1[rn+mn] < 4);
+
+ if (u1s)
{
- r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
+ u0[rn+mn] = 0;
+ u0s = abs_sub_n (u0, u0, u1, rn + mn + 1);
}
else
{
- ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
- r3s = 0; /* u3 + u5 */
+ u0[rn+mn] = u1[rn+mn] + mpn_add_n (u0, u0, u1, rn + mn);
+ u0s = 0;
}
+ MUL(u1, r3, rn + 1, t2, mn); /* 2 */
+ u1s = t2s;
+ ASSERT (u1[rn+mn] < 2);
- if (t0s)
- {
- t0[mn] = mpn_add_n (t0, t0, m0, mn);
- }
- else if (t0[mn] != 0)
- {
- t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
- }
- else
+ u1s = add_signed_n (u1, u0, u0s, u1, u1s, rn + mn + 1);
+
+ t2s = abs_sub_n (t2, m3, m1, mn);
+ if (s3s)
{
- t0s = abs_sub_n (t0, t0, m0, mn);
+ s3[rn] += mpn_add_n (s3, s3, r1, rn);
+ s3s = 0;
}
- MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */
- ASSERT (u0[rn+mn] < 2);
- if (r1s)
+ else if (s3[rn] > 0)
{
- ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
+ s3[rn] -= mpn_sub_n (s3, s3, r1, rn);
+ s3s = 1;
}
else
{
- r1[rn] += mpn_add_n (r1, r1, r2, rn);
- }
- rn++;
- t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
- /* u3 + u5 + u6 */
- ASSERT (r2[rn+mn-1] < 4);
- r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
- /* -u2 + u3 + u5 */
- ASSERT (r3[rn+mn-1] < 3);
- MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */
- ASSERT (u0[rn+mn-1] < 2);
- t0[mn] = mpn_add_n (t0, m3, m1, mn);
- MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */
- mn += rn;
- ASSERT (u1[mn-1] < 4);
- ASSERT (u1[mn] == 0);
- ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
- /* -u2 + u3 - u4 + u5 */
- ASSERT (r1[mn-1] < 2);
- if (r3s)
- {
- ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
+ s3s = abs_sub_n (s3, r1, s3, rn);
}
- else
+ MUL (r1, s3, rn+1, m3, mn); /* 5 */
+ ASSERT_NOCARRY(add_signed_n (r1, r1, s3s, u1, u1s, rn + mn + 1));
+ ASSERT (r1[rn + mn] < 2);
+
+ MUL (r3, r2, rn, t2, mn); /* 4 */
+ r3s = r2s ^ t2s;
+ r3[rn + mn] = 0;
+ u0s = add_signed_n (u0, u0, u0s, r3, r3s, rn + mn + 1);
+ ASSERT_NOCARRY (add_signed_n (r3, r3, r3s, u1, u1s, rn + mn + 1));
+ ASSERT (r3[rn + mn] < 2);
+
+ if (t3s)
{
- ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
- /* u1 + u2 - u3 - u5 */
+ t3[mn] += mpn_add_n (t3, m2, t3, mn);
+ t3s = 0;
}
- ASSERT (r3[mn-1] < 2);
- if (t0s)
+ else if (t3[mn] > 0)
{
- ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
+ t3[mn] -= mpn_sub_n (t3, t3, m2, mn);
+ t3s = 1;
}
else
{
- ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
- /* u1 - u3 - u5 - u6 */
+ t3s = abs_sub_n (t3, m2, t3, mn);
}
- ASSERT (r2[mn-1] < 2);
+ MUL (r2, s2, rn, t3, mn + 1); /* 6 */
+
+ ASSERT_NOCARRY (add_signed_n (r2, r2, t3s, u0, u0s, rn + mn + 1));
+ ASSERT (r2[rn + mn] < 2);
}
void
diff --git a/gmp/mpn/generic/matrix22_mul1_inverse_vector.c b/gmp/mpn/generic/matrix22_mul1_inverse_vector.c
deleted file mode 100644
index 83b2fb5134..0000000000
--- a/gmp/mpn/generic/matrix22_mul1_inverse_vector.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/* matrix22_mul1_inverse_vector.c
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2008, 2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Sets (r;b) = M^{-1}(a;b), with M^{-1} = (u11, -u01; -u10, u00) from
- the left. Uses three buffers, to avoid a copy. */
-mp_size_t
-mpn_matrix22_mul1_inverse_vector (const struct hgcd_matrix1 *M,
- mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
-{
- mp_limb_t h0, h1;
-
- /* Compute (r;b) <-- (u11 a - u01 b; -u10 a + u00 b) as
-
- r = u11 * a
- r -= u01 * b
- b *= u00
- b -= u10 * a
- */
-
- h0 = mpn_mul_1 (rp, ap, n, M->u[1][1]);
- h1 = mpn_submul_1 (rp, bp, n, M->u[0][1]);
- ASSERT (h0 == h1);
-
- h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]);
- h1 = mpn_submul_1 (bp, ap, n, M->u[1][0]);
- ASSERT (h0 == h1);
-
- n -= (rp[n-1] | bp[n-1]) == 0;
- return n;
-}
diff --git a/gmp/mpn/generic/mod_1.c b/gmp/mpn/generic/mod_1.c
index 0212020201..7c892814e1 100644
--- a/gmp/mpn/generic/mod_1.c
+++ b/gmp/mpn/generic/mod_1.c
@@ -3,34 +3,23 @@
Return the single-limb remainder.
There are no constraints on the value of the divisor.
-Copyright 1991, 1993, 1994, 1999, 2000, 2002, 2007-2009, 2012 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2002, 2007, 2008, 2009 Free
+Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -53,43 +42,18 @@ see https://www.gnu.org/licenses/. */
#define MOD_1_UNNORM_THRESHOLD 0
#endif
-#ifndef MOD_1U_TO_MOD_1_1_THRESHOLD
-#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */
+#ifndef MOD_1_1_THRESHOLD
+#define MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */
#endif
-#ifndef MOD_1N_TO_MOD_1_1_THRESHOLD
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */
+#ifndef MOD_1_2_THRESHOLD
+#define MOD_1_2_THRESHOLD 10
#endif
-#ifndef MOD_1_1_TO_MOD_1_2_THRESHOLD
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#ifndef MOD_1_4_THRESHOLD
+#define MOD_1_4_THRESHOLD 120
#endif
-#ifndef MOD_1_2_TO_MOD_1_4_THRESHOLD
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
-#endif
-
-#if TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p
-/* Duplicates declarations in tune/speed.h */
-mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]);
-mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]);
-
-void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
-void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
-
-#undef mpn_mod_1_1p
-#define mpn_mod_1_1p(ap, n, b, pre) \
- (mod_1_1p_method == 1 ? mpn_mod_1_1p_1 (ap, n, b, pre) \
- : (mod_1_1p_method == 2 ? mpn_mod_1_1p_2 (ap, n, b, pre) \
- : __gmpn_mod_1_1p (ap, n, b, pre)))
-
-#undef mpn_mod_1_1p_cps
-#define mpn_mod_1_1p_cps(pre, b) \
- (mod_1_1p_method == 1 ? mpn_mod_1_1p_cps_1 (pre, b) \
- : (mod_1_1p_method == 2 ? mpn_mod_1_1p_cps_2 (pre, b) \
- : __gmpn_mod_1_1p_cps (pre, b)))
-#endif /* TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p */
-
/* The comments in mpn/generic/divrem_1.c apply here too.
@@ -150,12 +114,12 @@ mpn_mod_1_unnorm (mp_srcptr up, mp_size_t un, mp_limb_t d)
if (UDIV_NEEDS_NORMALIZATION
&& BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD))
{
- mp_limb_t nshift;
for (i = un - 2; i >= 0; i--)
{
n0 = up[i] << GMP_NAIL_BITS;
- nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
- udiv_qrnnd (dummy, r, r, nshift, d);
+ udiv_qrnnd (dummy, r, r,
+ (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)),
+ d);
r >>= GMP_NAIL_BITS;
n1 = n0;
}
@@ -165,18 +129,19 @@ mpn_mod_1_unnorm (mp_srcptr up, mp_size_t un, mp_limb_t d)
}
else
{
- mp_limb_t inv, nshift;
+ mp_limb_t inv;
invert_limb (inv, d);
for (i = un - 2; i >= 0; i--)
{
n0 = up[i] << GMP_NAIL_BITS;
- nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
- udiv_rnnd_preinv (r, r, nshift, d, inv);
+ udiv_qrnnd_preinv (dummy, r, r,
+ (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)),
+ d, inv);
r >>= GMP_NAIL_BITS;
n1 = n0;
}
- udiv_rnnd_preinv (r, r, n1 << cnt, d, inv);
+ udiv_qrnnd_preinv (dummy, r, r, n1 << cnt, d, inv);
r >>= GMP_NAIL_BITS;
return r >> cnt;
}
@@ -222,7 +187,7 @@ mpn_mod_1_norm (mp_srcptr up, mp_size_t un, mp_limb_t d)
for (i = un - 1; i >= 0; i--)
{
n0 = up[i] << GMP_NAIL_BITS;
- udiv_rnnd_preinv (r, r, n0, d, inv);
+ udiv_qrnnd_preinv (dummy, r, r, n0, d, inv);
r >>= GMP_NAIL_BITS;
}
return r;
@@ -242,40 +207,29 @@ mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
{
- if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
- {
- return mpn_mod_1_norm (ap, n, b);
- }
- else
- {
- mp_limb_t pre[4];
- mpn_mod_1_1p_cps (pre, b);
- return mpn_mod_1_1p (ap, n, b, pre);
- }
+ /* The functions below do not handle this large divisor. */
+ return mpn_mod_1_norm (ap, n, b);
+ }
+ else if (BELOW_THRESHOLD (n, MOD_1_1_THRESHOLD))
+ {
+ return mpn_mod_1_unnorm (ap, n, b);
+ }
+ else if (BELOW_THRESHOLD (n, MOD_1_2_THRESHOLD))
+ {
+ mp_limb_t pre[4];
+ mpn_mod_1s_1p_cps (pre, b);
+ return mpn_mod_1s_1p (ap, n, b << pre[1], pre);
+ }
+ else if (BELOW_THRESHOLD (n, MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
+ {
+ mp_limb_t pre[5];
+ mpn_mod_1s_2p_cps (pre, b);
+ return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
}
else
{
- if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
- {
- return mpn_mod_1_unnorm (ap, n, b);
- }
- else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
- {
- mp_limb_t pre[4];
- mpn_mod_1_1p_cps (pre, b);
- return mpn_mod_1_1p (ap, n, b << pre[1], pre);
- }
- else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
- {
- mp_limb_t pre[5];
- mpn_mod_1s_2p_cps (pre, b);
- return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
- }
- else
- {
- mp_limb_t pre[7];
- mpn_mod_1s_4p_cps (pre, b);
- return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
- }
+ mp_limb_t pre[7];
+ mpn_mod_1s_4p_cps (pre, b);
+ return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
}
}
diff --git a/gmp/mpn/generic/mod_1_1.c b/gmp/mpn/generic/mod_1_1.c
index 2e111399ed..27c7f8f1b6 100644
--- a/gmp/mpn/generic/mod_1_1.c
+++ b/gmp/mpn/generic/mod_1_1.c
@@ -1,208 +1,74 @@
-/* mpn_mod_1_1p (ap, n, b, cps)
+/* mpn_mod_1s_1p (ap, n, b, cps)
Divide (ap,,n) by b. Return the single-limb remainder.
+ Requires that b < B / 2.
- Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
- Based on a suggestion by Peter L. Montgomery.
+ Contributed to the GNU project by Torbjorn Granlund.
THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2008-2011, 2013 Free Software Foundation, Inc.
+Copyright 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-#ifndef MOD_1_1P_METHOD
-# define MOD_1_1P_METHOD 1 /* need to make sure this is 2 for asm testing */
-#endif
-
-/* Define some longlong.h-style macros, but for wider operations.
- * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
- * carry out, in the form of a mask. */
-
-#if defined (__GNUC__)
-
-#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "add %6, %k2\n\t" \
- "adc %4, %k1\n\t" \
- "sbb %k0, %k0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
- "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
-#endif
-
-#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "add %6, %q2\n\t" \
- "adc %4, %q1\n\t" \
- "sbb %q0, %q0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
- "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
-#endif
-
-#if defined (__sparc__) && W_TYPE_SIZE == 32
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "addcc %r5, %6, %2\n\t" \
- "addxcc %r3, %4, %1\n\t" \
- "subx %%g0, %%g0, %0" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
- __CLOBBER_CC)
-#endif
-
-#if defined (__sparc__) && W_TYPE_SIZE == 64
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "addcc %r5, %6, %2\n\t" \
- "addccc %r7, %8, %%g0\n\t" \
- "addccc %r3, %4, %1\n\t" \
- "clr %0\n\t" \
- "movcs %%xcc, -1, %0" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
- "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
- __CLOBBER_CC)
-#if __VIS__ >= 0x300
-#undef add_mssaaaa
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "addcc %r5, %6, %2\n\t" \
- "addxccc %r3, %4, %1\n\t" \
- "clr %0\n\t" \
- "movcs %%xcc, -1, %0" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
- __CLOBBER_CC)
-#endif
-#endif
-
-#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
-/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
- processor running in 32-bit mode, since the carry flag then gets the 32-bit
- carry. */
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "add%I6c %2, %5, %6\n\t" \
- "adde %1, %3, %4\n\t" \
- "subfe %0, %0, %0\n\t" \
- "nor %0, %0, %0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
-#endif
-
-#if defined (__s390x__) && W_TYPE_SIZE == 64
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- __asm__ ( "algr %2, %6\n\t" \
- "alcgr %1, %4\n\t" \
- "lghi %0, 0\n\t" \
- "alcgr %0, %0\n\t" \
- "lcgr %0, %0" \
- : "=r" (m), "=r" (s1), "=&r" (s0) \
- : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
- "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
-#endif
-
-#if defined (__arm__) && W_TYPE_SIZE == 32
-#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
- __asm__ ( "adds %2, %5, %6\n\t" \
- "adcs %1, %3, %4\n\t" \
- "movcc %0, #0\n\t" \
- "movcs %0, #-1" \
- : "=r" (m), "=r" (sh), "=&r" (sl) \
- : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
-#endif
-#endif /* defined (__GNUC__) */
-
-#ifndef add_mssaaaa
-#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
- do { \
- UWtype __s0, __s1, __c0, __c1; \
- __s0 = (a0) + (b0); \
- __s1 = (a1) + (b1); \
- __c0 = __s0 < (a0); \
- __c1 = __s1 < (a1); \
- (s0) = __s0; \
- __s1 = __s1 + __c0; \
- (s1) = __s1; \
- (m) = - (__c1 + (__s1 < __c0)); \
- } while (0)
-#endif
-
-#if MOD_1_1P_METHOD == 1
void
-mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
+mpn_mod_1s_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
mp_limb_t bi;
mp_limb_t B1modb, B2modb;
int cnt;
+ ASSERT (b <= GMP_NUMB_MAX / 2);
+
count_leading_zeros (cnt, b);
b <<= cnt;
invert_limb (bi, b);
- cps[0] = bi;
- cps[1] = cnt;
-
- B1modb = -b;
- if (LIKELY (cnt != 0))
- B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+ B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
- cps[2] = B1modb >> cnt;
+ udiv_rnd_preinv (B2modb, B1modb, b, bi);
+
+ B1modb >>= cnt;
+ B2modb >>= cnt;
- /* In the normalized case, this can be simplified to
- *
- * B2modb = - b * bi;
- * ASSERT (B2modb <= b); // NB: equality iff b = B/2
- */
- udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
- cps[3] = B2modb >> cnt;
+ cps[0] = bi;
+ cps[1] = cnt;
+ cps[2] = B1modb;
+ cps[3] = B2modb;
}
mp_limb_t
-mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
+mpn_mod_1s_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
{
- mp_limb_t rh, rl, bi, ph, pl, r;
+ mp_limb_t rh, rl, bi, q, ph, pl, r;
mp_limb_t B1modb, B2modb;
mp_size_t i;
int cnt;
- mp_limb_t mask;
-
- ASSERT (n >= 2); /* fix tuneup.c if this is changed */
B1modb = bmodb[2];
B2modb = bmodb[3];
- rl = ap[n - 1];
- umul_ppmm (ph, pl, rl, B1modb);
- add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);
+ umul_ppmm (ph, pl, ap[n - 1], B1modb);
+ add_ssaaaa (rh, rl, ph, pl, 0, ap[n - 2]);
for (i = n - 3; i >= 0; i -= 1)
{
@@ -211,122 +77,28 @@ mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
+ HI(rr) * (B^2 mod b) <= (B-1)(b-1)
*/
umul_ppmm (ph, pl, rl, B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[i]);
umul_ppmm (rh, rl, rh, B2modb);
add_ssaaaa (rh, rl, rh, rl, ph, pl);
}
- cnt = bmodb[1];
bi = bmodb[0];
-
- if (LIKELY (cnt != 0))
- rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
-
- mask = -(mp_limb_t) (rh >= b);
- rh -= mask & b;
-
- udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);
-
- return r >> cnt;
-}
-#endif /* MOD_1_1P_METHOD == 1 */
-
-#if MOD_1_1P_METHOD == 2
-void
-mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
-{
- mp_limb_t bi;
- mp_limb_t B2modb;
- int cnt;
-
- count_leading_zeros (cnt, b);
-
- b <<= cnt;
- invert_limb (bi, b);
-
- cps[0] = bi;
- cps[1] = cnt;
-
- if (LIKELY (cnt != 0))
- {
- mp_limb_t B1modb = -b;
- B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
- ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
- cps[2] = B1modb >> cnt;
- }
- B2modb = - b * bi;
- ASSERT (B2modb <= b); // NB: equality iff b = B/2
- cps[3] = B2modb;
-}
-
-mp_limb_t
-mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
-{
- int cnt;
- mp_limb_t bi, B1modb;
- mp_limb_t r0, r1;
- mp_limb_t r;
-
- ASSERT (n >= 2); /* fix tuneup.c if this is changed */
-
- r0 = ap[n-2];
- r1 = ap[n-1];
-
- if (n > 2)
- {
- mp_limb_t B2modb, B2mb;
- mp_limb_t p0, p1;
- mp_limb_t r2;
- mp_size_t j;
-
- B2modb = bmodb[3];
- B2mb = B2modb - b;
-
- umul_ppmm (p1, p0, r1, B2modb);
- add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);
-
- for (j = n-4; j >= 0; j--)
- {
- mp_limb_t cy;
- /* mp_limb_t t = r0 + B2mb; */
- umul_ppmm (p1, p0, r1, B2modb);
-
- ADDC_LIMB (cy, r0, r0, r2 & B2modb);
- /* Alternative, for cmov: if (cy) r0 = t; */
- r0 -= (-cy) & b;
- add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
- }
-
- r1 -= (r2 & b);
- }
-
cnt = bmodb[1];
+#if 1
+ {
+ mp_limb_t mask;
+ r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
+ mask = -(mp_limb_t) (r >= b);
+ r -= mask & b;
+ }
+#else
+ udiv_qrnnd_preinv (q, r, rh >> (GMP_LIMB_BITS - cnt),
+ (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)), b, bi);
+ ASSERT (q <= 1); /* optimize for small quotient? */
+#endif
- if (LIKELY (cnt != 0))
- {
- mp_limb_t t;
- mp_limb_t B1modb = bmodb[2];
-
- umul_ppmm (r1, t, r1, B1modb);
- r0 += t;
- r1 += (r0 < t);
-
- /* Normalize */
- r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
- r0 <<= cnt;
-
- /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */
- }
- else
- {
- mp_limb_t mask = -(mp_limb_t) (r1 >= b);
- r1 -= mask & b;
- }
-
- bi = bmodb[0];
+ udiv_qrnnd_preinv (q, r, r, rl << cnt, b, bi);
- udiv_rnnd_preinv (r, r1, r0, b, bi);
return r >> cnt;
}
-#endif /* MOD_1_1P_METHOD == 2 */
diff --git a/gmp/mpn/generic/mod_1_2.c b/gmp/mpn/generic/mod_1_2.c
index 7acf3dbdd1..ffadd536de 100644
--- a/gmp/mpn/generic/mod_1_2.c
+++ b/gmp/mpn/generic/mod_1_2.c
@@ -3,39 +3,27 @@
Requires that b < B / 2.
Contributed to the GNU project by Torbjorn Granlund.
- Based on a suggestion by Peter L. Montgomery.
THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2008-2010 Free Software Foundation, Inc.
+Copyright 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -48,75 +36,49 @@ mpn_mod_1s_2p_cps (mp_limb_t cps[5], mp_limb_t b)
mp_limb_t B1modb, B2modb, B3modb;
int cnt;
- ASSERT (b <= (~(mp_limb_t) 0) / 2);
+ ASSERT (b <= GMP_NUMB_MAX / 2);
count_leading_zeros (cnt, b);
b <<= cnt;
invert_limb (bi, b);
- cps[0] = bi;
- cps[1] = cnt;
-
B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
- cps[2] = B1modb >> cnt;
+ udiv_rnd_preinv (B2modb, B1modb, b, bi);
+ udiv_rnd_preinv (B3modb, B2modb, b, bi);
- udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+ cps[0] = bi;
+ cps[1] = cnt;
+ cps[2] = B1modb >> cnt;
cps[3] = B2modb >> cnt;
-
- udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
cps[4] = B3modb >> cnt;
-
-#if WANT_ASSERT
- {
- int i;
- b = cps[2];
- for (i = 3; i <= 4; i++)
- {
- b += cps[i];
- ASSERT (b >= cps[i]);
- }
- }
-#endif
}
mp_limb_t
-mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[5])
+mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t cps[5])
{
- mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
+ mp_limb_t rh, rl, bi, q, ph, pl, ch, cl, r;
mp_limb_t B1modb, B2modb, B3modb;
mp_size_t i;
int cnt;
- ASSERT (n >= 1);
-
B1modb = cps[2];
B2modb = cps[3];
B3modb = cps[4];
if ((n & 1) != 0)
{
- if (n == 1)
- {
- rl = ap[n - 1];
- bi = cps[0];
- cnt = cps[1];
- udiv_rnnd_preinv (r, rl >> (GMP_LIMB_BITS - cnt),
- rl << cnt, b, bi);
- return r >> cnt;
- }
-
- umul_ppmm (ph, pl, ap[n - 2], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
umul_ppmm (rh, rl, ap[n - 1], B2modb);
+ umul_ppmm (ph, pl, ap[n - 2], B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
add_ssaaaa (rh, rl, rh, rl, ph, pl);
n--;
}
else
{
- rh = ap[n - 1];
- rl = ap[n - 2];
+ umul_ppmm (rh, rl, ap[n - 1], B1modb);
+ add_ssaaaa (rh, rl, rh, rl, 0, ap[n - 2]);
}
for (i = n - 4; i >= 0; i -= 2)
@@ -127,7 +89,7 @@ mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[5])
+ HI(rr) * (B^3 mod b) <= (B-1)(b-1)
*/
umul_ppmm (ph, pl, ap[i + 1], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);
umul_ppmm (ch, cl, rl, B2modb);
add_ssaaaa (ph, pl, ph, pl, ch, cl);
@@ -136,14 +98,20 @@ mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[5])
add_ssaaaa (rh, rl, rh, rl, ph, pl);
}
- umul_ppmm (rh, cl, rh, B1modb);
- add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
-
- cnt = cps[1];
bi = cps[0];
+ cnt = cps[1];
+#if 1
+ umul_ppmm (rh, cl, rh, B1modb);
+ add_ssaaaa (rh, rl, rh, rl, 0, cl);
r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
- udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
+#else
+ udiv_qrnnd_preinv (q, r, rh >> (GMP_LIMB_BITS - cnt),
+ (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)), b, bi);
+ ASSERT (q <= 2); /* optimize for small quotient? */
+#endif
+
+ udiv_qrnnd_preinv (q, r, r, rl << cnt, b, bi);
return r >> cnt;
}
diff --git a/gmp/mpn/generic/mod_1_3.c b/gmp/mpn/generic/mod_1_3.c
index f4137f4315..77989fc0ae 100644
--- a/gmp/mpn/generic/mod_1_3.c
+++ b/gmp/mpn/generic/mod_1_3.c
@@ -3,39 +3,27 @@
Requires that d < B / 3.
Contributed to the GNU project by Torbjorn Granlund.
- Based on a suggestion by Peter L. Montgomery.
THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2008-2010, 2013 Free Software Foundation, Inc.
+Copyright 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -48,82 +36,46 @@ mpn_mod_1s_3p_cps (mp_limb_t cps[6], mp_limb_t b)
mp_limb_t B1modb, B2modb, B3modb, B4modb;
int cnt;
- ASSERT (b <= (~(mp_limb_t) 0) / 3);
+ ASSERT (b <= GMP_NUMB_MAX / 3);
count_leading_zeros (cnt, b);
b <<= cnt;
invert_limb (bi, b);
- cps[0] = bi;
- cps[1] = cnt;
-
B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
- cps[2] = B1modb >> cnt;
+ udiv_rnd_preinv (B2modb, B1modb, b, bi);
+ udiv_rnd_preinv (B3modb, B2modb, b, bi);
+ udiv_rnd_preinv (B4modb, B3modb, b, bi);
- udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+ cps[0] = bi;
+ cps[1] = cnt;
+ cps[2] = B1modb >> cnt;
cps[3] = B2modb >> cnt;
-
- udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
cps[4] = B3modb >> cnt;
-
- udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
cps[5] = B4modb >> cnt;
-
-#if WANT_ASSERT
- {
- int i;
- b = cps[2];
- for (i = 3; i <= 5; i++)
- {
- b += cps[i];
- ASSERT (b >= cps[i]);
- }
- }
-#endif
}
mp_limb_t
-mpn_mod_1s_3p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[6])
+mpn_mod_1s_3p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t cps[6])
{
- mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
+ mp_limb_t rh, rl, bi, q, ph, pl, ch, cl, r;
mp_limb_t B1modb, B2modb, B3modb, B4modb;
mp_size_t i;
int cnt;
- ASSERT (n >= 1);
-
B1modb = cps[2];
B2modb = cps[3];
B3modb = cps[4];
B4modb = cps[5];
- /* We compute n mod 3 in a tricky way, which works except for when n is so
- close to the maximum size that we don't need to support it. The final
- cast to int is a workaround for HP cc. */
- switch ((int) ((mp_limb_t) n * MODLIMB_INVERSE_3 >> (GMP_NUMB_BITS - 2)))
- {
- case 0:
- umul_ppmm (ph, pl, ap[n - 2], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
- umul_ppmm (rh, rl, ap[n - 1], B2modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- n -= 3;
- break;
- case 2: /* n mod 3 = 1 */
- rh = 0;
- rl = ap[n - 1];
- n -= 1;
- break;
- case 1: /* n mod 3 = 2 */
- rh = ap[n - 1];
- rl = ap[n - 2];
- n -= 2;
- break;
- }
+ umul_ppmm (ph, pl, ap[n - 2], B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
+ umul_ppmm (ch, cl, ap[n - 1], B2modb);
+ add_ssaaaa (rh, rl, ph, pl, ch, cl);
- for (i = n - 3; i >= 0; i -= 3)
+ for (i = n - 6; i >= 0; i -= 3)
{
/* rr = ap[i] < B
+ ap[i+1] * (B mod b) <= (B-1)(b-1)
@@ -132,7 +84,7 @@ mpn_mod_1s_3p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[6])
+ HI(rr) * (B^4 mod b) <= (B-1)(b-1)
*/
umul_ppmm (ph, pl, ap[i + 1], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);
umul_ppmm (ch, cl, ap[i + 2], B2modb);
add_ssaaaa (ph, pl, ph, pl, ch, cl);
@@ -144,14 +96,35 @@ mpn_mod_1s_3p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[6])
add_ssaaaa (rh, rl, rh, rl, ph, pl);
}
- umul_ppmm (rh, cl, rh, B1modb);
- add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
+ if (i >= -2)
+ {
+ umul_ppmm (ph, pl, rl, B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 2]);
+ umul_ppmm (rh, rl, rh, B2modb);
+ add_ssaaaa (rh, rl, rh, rl, ph, pl);
+ if (i >= -1)
+ {
+ umul_ppmm (ph, pl, rl, B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[0]);
+ umul_ppmm (rh, rl, rh, B2modb);
+ add_ssaaaa (rh, rl, rh, rl, ph, pl);
+ }
+ }
- cnt = cps[1];
bi = cps[0];
+ cnt = cps[1];
+#if 1
+ umul_ppmm (rh, cl, rh, B1modb);
+ add_ssaaaa (rh, rl, rh, rl, 0, cl);
r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
- udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
+#else
+ udiv_qrnnd_preinv (q, r, rh >> (GMP_LIMB_BITS - cnt),
+ (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)), b, bi);
+ ASSERT (q <= 3); /* optimize for small quotient? */
+#endif
+
+ udiv_qrnnd_preinv (q, r, r, rl << cnt, b, bi);
return r >> cnt;
}
diff --git a/gmp/mpn/generic/mod_1_4.c b/gmp/mpn/generic/mod_1_4.c
index 716a0c66de..74893386a9 100644
--- a/gmp/mpn/generic/mod_1_4.c
+++ b/gmp/mpn/generic/mod_1_4.c
@@ -1,41 +1,29 @@
-/* mpn_mod_1s_4p (ap, n, b, cps)
+/* mpn_mod_1s_4p (ap, n, b, cps)
Divide (ap,,n) by b. Return the single-limb remainder.
Requires that d < B / 4.
Contributed to the GNU project by Torbjorn Granlund.
- Based on a suggestion by Peter L. Montgomery.
THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2008-2010 Free Software Foundation, Inc.
+Copyright 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -48,92 +36,53 @@ mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
int cnt;
- ASSERT (b <= (~(mp_limb_t) 0) / 4);
+ ASSERT (b <= GMP_NUMB_MAX / 4);
count_leading_zeros (cnt, b);
b <<= cnt;
invert_limb (bi, b);
- cps[0] = bi;
- cps[1] = cnt;
-
B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
- cps[2] = B1modb >> cnt;
+ udiv_rnd_preinv (B2modb, B1modb, b, bi);
+ udiv_rnd_preinv (B3modb, B2modb, b, bi);
+ udiv_rnd_preinv (B4modb, B3modb, b, bi);
+ udiv_rnd_preinv (B5modb, B4modb, b, bi);
- udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+ cps[0] = bi;
+ cps[1] = cnt;
+ cps[2] = B1modb >> cnt;
cps[3] = B2modb >> cnt;
-
- udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
cps[4] = B3modb >> cnt;
-
- udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
cps[5] = B4modb >> cnt;
-
- udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi);
cps[6] = B5modb >> cnt;
-
-#if WANT_ASSERT
- {
- int i;
- b = cps[2];
- for (i = 3; i <= 6; i++)
- {
- b += cps[i];
- ASSERT (b >= cps[i]);
- }
- }
-#endif
}
mp_limb_t
-mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
+mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t cps[7])
{
- mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
+ mp_limb_t rh, rl, bi, q, ph, pl, ch, cl, r;
mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
mp_size_t i;
int cnt;
- ASSERT (n >= 1);
-
B1modb = cps[2];
B2modb = cps[3];
B3modb = cps[4];
B4modb = cps[5];
B5modb = cps[6];
- switch (n & 3)
- {
- case 0:
- umul_ppmm (ph, pl, ap[n - 3], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]);
- umul_ppmm (ch, cl, ap[n - 2], B2modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
- umul_ppmm (rh, rl, ap[n - 1], B3modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- n -= 4;
- break;
- case 1:
- rh = 0;
- rl = ap[n - 1];
- n -= 1;
- break;
- case 2:
- rh = ap[n - 1];
- rl = ap[n - 2];
- n -= 2;
- break;
- case 3:
- umul_ppmm (ph, pl, ap[n - 2], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
- umul_ppmm (rh, rl, ap[n - 1], B2modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- n -= 3;
- break;
- }
+ umul_ppmm (ph, pl, ap[n - 3], B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]);
+
+ umul_ppmm (ch, cl, ap[n - 2], B2modb);
+ add_ssaaaa (ph, pl, ph, pl, ch, cl);
- for (i = n - 4; i >= 0; i -= 4)
+ umul_ppmm (ch, cl, ap[n - 1], B3modb);
+ add_ssaaaa (rh, rl, ph, pl, ch, cl);
+
+ for (i = n - 8; i >= 0; i -= 4)
{
/* rr = ap[i] < B
+ ap[i+1] * (B mod b) <= (B-1)(b-1)
@@ -143,7 +92,7 @@ mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
+ HI(rr) * (B^5 mod b) <= (B-1)(b-1)
*/
umul_ppmm (ph, pl, ap[i + 1], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);
umul_ppmm (ch, cl, ap[i + 2], B2modb);
add_ssaaaa (ph, pl, ph, pl, ch, cl);
@@ -158,14 +107,42 @@ mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
add_ssaaaa (rh, rl, rh, rl, ph, pl);
}
- umul_ppmm (rh, cl, rh, B1modb);
- add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
+ if (i >= -3)
+ {
+ umul_ppmm (ph, pl, rl, B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 3]);
+ umul_ppmm (rh, rl, rh, B2modb);
+ add_ssaaaa (rh, rl, rh, rl, ph, pl);
+ if (i >= -2)
+ {
+ umul_ppmm (ph, pl, rl, B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 2]);
+ umul_ppmm (rh, rl, rh, B2modb);
+ add_ssaaaa (rh, rl, rh, rl, ph, pl);
+ if (i >= -1)
+ {
+ umul_ppmm (ph, pl, rl, B1modb);
+ add_ssaaaa (ph, pl, ph, pl, 0, ap[0]);
+ umul_ppmm (rh, rl, rh, B2modb);
+ add_ssaaaa (rh, rl, rh, rl, ph, pl);
+ }
+ }
+ }
- cnt = cps[1];
bi = cps[0];
+ cnt = cps[1];
+#if 1
+ umul_ppmm (rh, cl, rh, B1modb);
+ add_ssaaaa (rh, rl, rh, rl, 0, cl);
r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
- udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
+#else
+ udiv_qrnnd_preinv (q, r, rh >> (GMP_LIMB_BITS - cnt),
+ (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)), b, bi);
+ ASSERT (q <= 4); /* optimize for small quotient? */
+#endif
+
+ udiv_qrnnd_preinv (q, r, r, rl << cnt, b, bi);
return r >> cnt;
}
diff --git a/gmp/mpn/generic/mod_34lsub1.c b/gmp/mpn/generic/mod_34lsub1.c
index 7c07af7acc..6bd149892d 100644
--- a/gmp/mpn/generic/mod_34lsub1.c
+++ b/gmp/mpn/generic/mod_34lsub1.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2000-2002 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
diff --git a/gmp/mpn/generic/mode1o.c b/gmp/mpn/generic/mode1o.c
index ec91da223d..064becdadf 100644
--- a/gmp/mpn/generic/mode1o.c
+++ b/gmp/mpn/generic/mode1o.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2000-2004 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -41,7 +30,7 @@ see https://www.gnu.org/licenses/. */
r*B^k + a - c == q*d
- where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
+ where B=2^BITS_PER_MP_LIMB, a is {src,size}, k is either size or size-1
(the caller won't know which), and q is the quotient (discarded). d must
be odd, c can be any limb value.
diff --git a/gmp/mpn/generic/mu_bdiv_q.c b/gmp/mpn/generic/mu_bdiv_q.c
index 0a8010ec15..3b5f56d088 100644
--- a/gmp/mpn/generic/mu_bdiv_q.c
+++ b/gmp/mpn/generic/mu_bdiv_q.c
@@ -4,44 +4,40 @@
Contributed to the GNU project by Torbjorn Granlund.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
-Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2005, 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-/*
- The idea of the algorithm used herein is to compute a smaller inverted value
- than used in the standard Barrett algorithm, and thus save time in the
- Newton iterations, and pay just a small price when using the inverted value
- for developing quotient bits. This algorithm was presented at ICMS 2006.
+/* We use the "misunderstanding algorithm" (MU), discovered by Paul Zimmermann
+ and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
+ Jebelean's bidirectional exact division algorithm.
+
+ The idea of this algorithm is to compute a smaller inverted value than used
+ in the standard Barrett algorithm, and thus save time in the Newton
+ iterations, and pay just a small price when using the inverted value for
+ developing quotient bits.
+
+ Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the
+ "wrap around" trick.
*/
#include "gmp.h"
@@ -53,10 +49,11 @@ see https://www.gnu.org/licenses/. */
Requirements: N >= D
D >= 1
+ N mod D = 0
D odd
dn >= 2
nn >= 2
- scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).
+ scratch space as determined by mpn_divexact_itch(nn,dn).
Write quotient to Q = {qp,nn}.
@@ -72,10 +69,10 @@ mpn_mu_bdiv_q (mp_ptr qp,
mp_srcptr dp, mp_size_t dn,
mp_ptr scratch)
{
+ mp_ptr ip;
+ mp_ptr rp;
mp_size_t qn;
mp_size_t in;
- int cy, c0;
- mp_size_t tn, wn;
qn = nn;
@@ -85,52 +82,74 @@ mpn_mu_bdiv_q (mp_ptr qp,
if (qn > dn)
{
mp_size_t b;
+ mp_ptr tp;
+ mp_limb_t cy;
+ int k;
+ mp_size_t m, wn;
+ mp_size_t i;
/* |_______________________| dividend
|________| divisor */
-#define ip scratch /* in */
-#define rp (scratch + in) /* dn or rest >= binvert_itch(in) */
-#define tp (scratch + in + dn) /* dn+in or next_size(dn) */
-#define scratch_out (scratch + in + dn + tn) /* mulmod_bnm1_itch(next_size(dn)) */
-
/* Compute an inverse size that is a nice partition of the quotient. */
b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
/* Some notes on allocation:
- When in = dn, R dies when mpn_mullo returns, if in < dn the low in
+ When in = dn, R dies when mpn_mullow returns, if in < dn the low in
limbs of R dies at that point. We could save memory by letting T live
just under R, and let the upper part of T expand into R. These changes
should reduce itch to perhaps 3dn.
*/
- mpn_binvert (ip, dp, in, rp);
+ ip = scratch; /* in limbs */
+ rp = scratch + in; /* dn limbs */
+ tp = scratch + in + dn; /* dn + in limbs FIXME: mpn_fft_next_size */
+ scratch += in; /* Roughly 2in+1 limbs */
+
+ mpn_binvert (ip, dp, in, scratch);
cy = 0;
MPN_COPY (rp, np, dn);
np += dn;
- mpn_mullo_n (qp, rp, ip, in);
+ mpn_mullow_n (qp, rp, ip, in);
qn -= in;
+ if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
+ {
+ k = mpn_fft_best_k (dn, 0);
+ m = mpn_fft_next_size (dn, k);
+ wn = dn + in - m; /* number of wrapped limbs */
+ ASSERT_ALWAYS (wn >= 0); /* could handle this below */
+ }
+
while (qn > in)
{
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */
- else
+#if WANT_FFT
+ if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
{
- tn = mpn_mulmod_bnm1_next_size (dn);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
- wn = dn + in - tn; /* number of wrapped limbs */
- if (wn > 0)
- {
- c0 = mpn_sub_n (tp + tn, tp, rp, wn);
- mpn_decr_u (tp + wn, c0);
- }
+ /* The two multiplicands are dn and 'in' limbs, with dn >= in.
+ The relevant part of the result will typically partially wrap,
+ and that part will come out as subtracted to the right. The
+ unwrapped part, m-in limbs at the high end of tp, is the lower
+ part of the sought product. The wrapped part, at the low end
+ of tp, will be subtracted from the low part of the partial
+ remainder; we undo that operation with another subtraction. */
+ int c0;
+
+ mpn_mul_fft (tp, m, dp, dn, qp, in, k);
+
+ c0 = mpn_sub_n (tp + m, rp, tp, wn);
+
+ for (i = wn; c0 != 0 && i < in; i++)
+ c0 = tp[i] == GMP_NUMB_MASK;
+ mpn_incr_u (tp + in, c0);
}
-
+ else
+#endif
+ mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */
qp += in;
if (dn != in)
{
@@ -145,28 +164,29 @@ mpn_mu_bdiv_q (mp_ptr qp,
/* Subtract tp[dn+in-1...dn] from dividend. */
cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
np += in;
- mpn_mullo_n (qp, rp, ip, in);
+ mpn_mullow_n (qp, rp, ip, in);
qn -= in;
}
/* Generate last qn limbs.
FIXME: It should be possible to limit precision here, since qn is
typically somewhat smaller than dn. No big gains expected. */
-
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[qn+in-1...in] */
- else
+#if WANT_FFT
+ if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
{
- tn = mpn_mulmod_bnm1_next_size (dn);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
- wn = dn + in - tn; /* number of wrapped limbs */
- if (wn > 0)
- {
- c0 = mpn_sub_n (tp + tn, tp, rp, wn);
- mpn_decr_u (tp + wn, c0);
- }
- }
+ int c0;
+
+ mpn_mul_fft (tp, m, dp, dn, qp, in, k);
+
+ c0 = mpn_sub_n (tp + m, rp, tp, wn);
+ for (i = wn; c0 != 0 && i < in; i++)
+ c0 = tp[i] == GMP_NUMB_MASK;
+ mpn_incr_u (tp + in, c0);
+ }
+ else
+#endif
+ mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[qn+in-1...in] */
qp += in;
if (dn != in)
{
@@ -179,93 +199,57 @@ mpn_mu_bdiv_q (mp_ptr qp,
}
mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
- mpn_mullo_n (qp, rp, ip, qn);
-
-#undef ip
-#undef rp
-#undef tp
-#undef scratch_out
- }
+ mpn_mullow_n (qp, rp, ip, qn);
+ }
else
{
/* |_______________________| dividend
|________________| divisor */
-#define ip scratch /* in */
-#define tp (scratch + in) /* qn+in or next_size(qn) or rest >= binvert_itch(in) */
-#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */
-
/* Compute half-sized inverse. */
in = qn - (qn >> 1);
- mpn_binvert (ip, dp, in, tp);
+ ip = scratch; /* ceil(qn/2) limbs */
+ rp = scratch + in; /* ceil(qn/2)+qn limbs */
+ scratch += in; /* 2*ceil(qn/2)+2 */
- mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */
+ mpn_binvert (ip, dp, in, scratch);
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, qn, qp, in); /* mulhigh */
- else
+ mpn_mullow_n (qp, np, ip, in); /* low `in' quotient limbs */
+#if WANT_FFT
+ if (ABOVE_THRESHOLD (qn, MUL_FFT_MODF_THRESHOLD))
{
- tn = mpn_mulmod_bnm1_next_size (qn);
- mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
- wn = qn + in - tn; /* number of wrapped limbs */
- if (wn > 0)
- {
- c0 = mpn_cmp (tp, np, wn) < 0;
- mpn_decr_u (tp + wn, c0);
- }
+ int k;
+ mp_size_t m;
+
+ k = mpn_fft_best_k (qn, 0);
+ m = mpn_fft_next_size (qn, k);
+ mpn_mul_fft (rp, m, dp, qn, qp, in, k);
+ if (mpn_cmp (np, rp, in) < 0)
+ mpn_incr_u (rp + in, 1);
}
+ else
+#endif
+ mpn_mul (rp, dp, qn, qp, in); /* mulhigh */
- mpn_sub_n (tp, np + in, tp + in, qn - in);
- mpn_mullo_n (qp + in, tp, ip, qn - in); /* high qn-in quotient limbs */
-
-#undef ip
-#undef tp
-#undef scratch_out
+ mpn_sub_n (rp, np + in, rp + in, qn - in);
+ mpn_mullow_n (qp + in, rp, ip, qn - in); /* high qn-in quotient limbs */
}
}
mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
- mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
- mp_size_t b;
+ mp_size_t qn;
qn = nn;
if (qn > dn)
{
- b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
- in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- {
- tn = dn + in;
- itch_out = 0;
- }
- else
- {
- tn = mpn_mulmod_bnm1_next_size (dn);
- itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
- }
- itch_binvert = mpn_binvert_itch (in);
- itches = dn + tn + itch_out;
- return in + MAX (itches, itch_binvert);
+ return 4 * dn; /* FIXME FIXME FIXME need mpn_fft_next_size */
}
else
{
- in = qn - (qn >> 1);
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- {
- tn = qn + in;
- itch_out = 0;
- }
- else
- {
- tn = mpn_mulmod_bnm1_next_size (qn);
- itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
- }
- itch_binvert = mpn_binvert_itch (in);
- itches = tn + itch_out;
- return in + MAX (itches, itch_binvert);
+ return 2 * qn + 1 + 2; /* FIXME FIXME FIXME need mpn_fft_next_size */
}
}
diff --git a/gmp/mpn/generic/mu_bdiv_qr.c b/gmp/mpn/generic/mu_bdiv_qr.c
index d265440f2b..e66b4a117e 100644
--- a/gmp/mpn/generic/mu_bdiv_qr.c
+++ b/gmp/mpn/generic/mu_bdiv_qr.c
@@ -1,289 +1,51 @@
-/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
- where qn = nn-dn, storing the result in {qp,qn}. Overlap allowed between Q
- and N; all other overlap disallowed.
+/* mpn_mu_bdiv_qr -- divide-and-conquer Hensel division using a variant of
+ Barrett's algorithm, returning quotient and remainder.
- Contributed to the GNU project by Torbjorn Granlund.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2005-2007, 2009, 2010, 2012 Free Software Foundation, Inc.
+Copyright 2005, 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-
-/*
- The idea of the algorithm used herein is to compute a smaller inverted value
- than used in the standard Barrett algorithm, and thus save time in the
- Newton iterations, and pay just a small price when using the inverted value
- for developing quotient bits. This algorithm was presented at ICMS 2006.
-*/
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
-/* N = {np,nn}
- D = {dp,dn}
+/* Computes Hensel binary division of {np, 2*n} by {dp, n}.
+
+ Output:
+
+ q = n * d^{-1} mod 2^{qn * GMP_NUMB_BITS},
- Requirements: N >= D
- D >= 1
- D odd
- dn >= 2
- nn >= 2
- scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
+ r = (n - q * d) * 2^{-qn * GMP_NUMB_BITS}
- Write quotient to Q = {qp,nn-dn}.
+ Stores q at qp. Stores the n least significant limbs of r at the high half
+ of np, and returns the borrow from the subtraction n - q*d.
- FIXME: When iterating, perhaps do the small step before loop, not after.
- FIXME: Try to avoid the scalar divisions when computing inverse size.
- FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In
- particular, when dn==in, tp and rp could use the same space.
-*/
-mp_limb_t
+ d must be odd. dinv is (-d)^-1 mod 2^GMP_NUMB_BITS. */
+
+void
mpn_mu_bdiv_qr (mp_ptr qp,
mp_ptr rp,
mp_srcptr np, mp_size_t nn,
mp_srcptr dp, mp_size_t dn,
mp_ptr scratch)
{
- mp_size_t qn;
- mp_size_t in;
- mp_limb_t cy, c0;
- mp_size_t tn, wn;
-
- qn = nn - dn;
-
- ASSERT (dn >= 2);
- ASSERT (qn >= 2);
-
- if (qn > dn)
- {
- mp_size_t b;
-
- /* |_______________________| dividend
- |________| divisor */
-
-#define ip scratch /* in */
-#define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
-#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
-
- /* Compute an inverse size that is a nice partition of the quotient. */
- b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
- in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
-
- /* Some notes on allocation:
-
- When in = dn, R dies when mpn_mullo returns, if in < dn the low in
- limbs of R dies at that point. We could save memory by letting T live
- just under R, and let the upper part of T expand into R. These changes
- should reduce itch to perhaps 3dn.
- */
-
- mpn_binvert (ip, dp, in, tp);
-
- MPN_COPY (rp, np, dn);
- np += dn;
- cy = 0;
-
- while (qn > in)
- {
- mpn_mullo_n (qp, rp, ip, in);
-
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */
- else
- {
- tn = mpn_mulmod_bnm1_next_size (dn);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
- wn = dn + in - tn; /* number of wrapped limbs */
- if (wn > 0)
- {
- c0 = mpn_sub_n (tp + tn, tp, rp, wn);
- mpn_decr_u (tp + wn, c0);
- }
- }
-
- qp += in;
- qn -= in;
-
- if (dn != in)
- {
- /* Subtract tp[dn-1...in] from partial remainder. */
- cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
- if (cy == 2)
- {
- mpn_incr_u (tp + dn, 1);
- cy = 1;
- }
- }
- /* Subtract tp[dn+in-1...dn] from dividend. */
- cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
- np += in;
- }
-
- /* Generate last qn limbs. */
- mpn_mullo_n (qp, rp, ip, qn);
-
- if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, qn); /* mulhi, need tp[qn+in-1...in] */
- else
- {
- tn = mpn_mulmod_bnm1_next_size (dn);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
- wn = dn + qn - tn; /* number of wrapped limbs */
- if (wn > 0)
- {
- c0 = mpn_sub_n (tp + tn, tp, rp, wn);
- mpn_decr_u (tp + wn, c0);
- }
- }
-
- if (dn != qn)
- {
- cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
- if (cy == 2)
- {
- mpn_incr_u (tp + dn, 1);
- cy = 1;
- }
- }
- return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);
-
-#undef ip
-#undef tp
-#undef scratch_out
- }
- else
- {
- /* |_______________________| dividend
- |________________| divisor */
-
-#define ip scratch /* in */
-#define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
-#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
-
- /* Compute half-sized inverse. */
- in = qn - (qn >> 1);
-
- mpn_binvert (ip, dp, in, tp);
-
- mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */
-
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, in); /* mulhigh */
- else
- {
- tn = mpn_mulmod_bnm1_next_size (dn);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
- wn = dn + in - tn; /* number of wrapped limbs */
- if (wn > 0)
- {
- c0 = mpn_sub_n (tp + tn, tp, np, wn);
- mpn_decr_u (tp + wn, c0);
- }
- }
-
- qp += in;
- qn -= in;
-
- cy = mpn_sub_n (rp, np + in, tp + in, dn);
- mpn_mullo_n (qp, rp, ip, qn); /* high qn quotient limbs */
-
- if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, qn); /* mulhigh */
- else
- {
- tn = mpn_mulmod_bnm1_next_size (dn);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
- wn = dn + qn - tn; /* number of wrapped limbs */
- if (wn > 0)
- {
- c0 = mpn_sub_n (tp + tn, tp, rp, wn);
- mpn_decr_u (tp + wn, c0);
- }
- }
-
- cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
- if (cy == 2)
- {
- mpn_incr_u (tp + dn, 1);
- cy = 1;
- }
- return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);
-
-#undef ip
-#undef tp
-#undef scratch_out
- }
-}
-
-mp_size_t
-mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
-{
- mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
- mp_size_t b;
-
- qn = nn - dn;
-
- if (qn > dn)
- {
- b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
- in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- {
- tn = dn + in;
- itch_out = 0;
- }
- else
- {
- tn = mpn_mulmod_bnm1_next_size (dn);
- itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
- }
- itch_binvert = mpn_binvert_itch (in);
- itches = tn + itch_out;
- return in + MAX (itches, itch_binvert);
- }
- else
- {
- in = qn - (qn >> 1);
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- {
- tn = dn + in;
- itch_out = 0;
- }
- else
- {
- tn = mpn_mulmod_bnm1_next_size (dn);
- itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
- }
- }
- itch_binvert = mpn_binvert_itch (in);
- itches = tn + itch_out;
- return in + MAX (itches, itch_binvert);
+ ASSERT_ALWAYS (0);
}
diff --git a/gmp/mpn/generic/mu_div_q.c b/gmp/mpn/generic/mu_div_q.c
index 8768ba6c60..150e8b77cd 100644
--- a/gmp/mpn/generic/mu_div_q.c
+++ b/gmp/mpn/generic/mu_div_q.c
@@ -1,46 +1,29 @@
-/* mpn_mu_div_q.
+/* mpn_mu_div_q, mpn_preinv_mu_div_q.
- Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+ Contributed to the GNU project by Torbjörn Granlund.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
-Copyright 2005-2007, 2009, 2010, 2013 Free Software Foundation, Inc.
+Copyright 2005, 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-/*
- The idea of the algorithm used herein is to compute a smaller inverted value
- than used in the standard Barrett algorithm, and thus save time in the
- Newton iterations, and pay just a small price when using the inverted value
- for developing quotient bits. This algorithm was presented at ICMS 2006.
-*/
/*
Things to work on:
@@ -48,14 +31,18 @@ see https://www.gnu.org/licenses/. */
1. This is a rudimentary implementation of mpn_mu_div_q. The algorithm is
probably close to optimal, except when mpn_mu_divappr_q fails.
- 2. We used to fall back to mpn_mu_div_qr when we detect a possible
- mpn_mu_divappr_q rounding problem, now we multiply and compare.
+ An alternative which could be considered for much simpler code for the
+ complex qn>=dn arm would be to allocate a temporary nn+1 limb buffer, then
+ simply call mpn_mu_divappr_q. Such a temporary allocation is
+ unfortunately very large.
+
+ 2. Instead of falling back to mpn_mu_div_qr when we detect a possible
+ mpn_mu_divappr_q rounding problem, we could multiply and compare.
Unfortunately, since mpn_mu_divappr_q does not return the partial
- remainder, this also doesn't become optimal. A mpn_mu_divappr_qr could
- solve that.
+ remainder, this also doesn't become optimal. A mpn_mu_divappr_qr
+ could solve that.
- 3. The allocations done here should be made from the scratch area, which
- then would need to be amended.
+ 3. The allocations done here should be made from the scratch area.
*/
#include <stdlib.h> /* for NULL */
@@ -65,13 +52,13 @@ see https://www.gnu.org/licenses/. */
mp_limb_t
mpn_mu_div_q (mp_ptr qp,
- mp_srcptr np, mp_size_t nn,
+ mp_ptr np, mp_size_t nn,
mp_srcptr dp, mp_size_t dn,
mp_ptr scratch)
{
- mp_ptr tp, rp;
- mp_size_t qn;
- mp_limb_t cy, qh;
+ mp_ptr tp, rp, ip, this_ip;
+ mp_size_t qn, in, this_in;
+ mp_limb_t cy;
TMP_DECL;
TMP_MARK;
@@ -82,28 +69,59 @@ mpn_mu_div_q (mp_ptr qp,
if (qn >= dn) /* nn >= 2*dn + 1 */
{
- /* |_______________________| dividend
- |________| divisor */
+ /* Find max inverse size needed by the two preinv calls. */
+ if (dn != qn)
+ {
+ mp_size_t in1, in2;
- rp = TMP_BALLOC_LIMBS (nn + 1);
- MPN_COPY (rp + 1, np, nn);
- rp[0] = 0;
+ in1 = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
+ in2 = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
+ in = MAX (in1, in2);
+ }
+ else
+ {
+ in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
+ }
- qh = mpn_cmp (rp + 1 + nn - dn, dp, dn) >= 0;
- if (qh != 0)
- mpn_sub_n (rp + 1 + nn - dn, rp + 1 + nn - dn, dp, dn);
+ ip = TMP_BALLOC_LIMBS (in + 1);
- cy = mpn_mu_divappr_q (tp, rp, nn + 1, dp, dn, scratch);
+ if (dn == in)
+ {
+ MPN_COPY (scratch + 1, dp, in);
+ scratch[0] = 1;
+ mpn_invert (ip, scratch, in + 1, NULL);
+ MPN_COPY_INCR (ip, ip + 1, in);
+ }
+ else
+ {
+ cy = mpn_add_1 (scratch, dp + dn - (in + 1), in + 1, 1);
+ if (UNLIKELY (cy != 0))
+ MPN_ZERO (ip, in);
+ else
+ {
+ mpn_invert (ip, scratch, in + 1, NULL);
+ MPN_COPY_INCR (ip, ip + 1, in);
+ }
+ }
- if (UNLIKELY (cy != 0))
+ /* |_______________________| dividend
+ |________| divisor */
+ rp = TMP_BALLOC_LIMBS (2 * dn + 1);
+ if (dn != qn) /* FIXME: perhaps mpn_mu_div_qr should DTRT */
{
- /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
- canonically reduced, replace the returned value of B^(qn-dn)+eps
- by the largest possible value. */
- mp_size_t i;
- for (i = 0; i < qn + 1; i++)
- tp[i] = GMP_NUMB_MAX;
+ this_in = mpn_mu_div_qr_choose_in (qn - dn, dn, 0);
+ this_ip = ip + in - this_in;
+ mpn_preinv_mu_div_qr (tp + dn + 1, rp + dn + 1, np + dn, qn, dp, dn,
+ this_ip, this_in, scratch);
}
+ else
+ MPN_COPY (rp + dn + 1, np + dn, dn);
+
+ MPN_COPY (rp + 1, np, dn);
+ rp[0] = 0;
+ this_in = mpn_mu_divappr_q_choose_in (dn + 1, dn, 0);
+ this_ip = ip + in - this_in;
+ mpn_preinv_mu_divappr_q (tp, rp, 2*dn + 1, dp, dn, this_ip, this_in, scratch);
/* The max error of mpn_mu_divappr_q is +4. If the low quotient limb is
greater than the max error, we cannot trust the quotient. */
@@ -113,73 +131,27 @@ mpn_mu_div_q (mp_ptr qp,
}
else
{
- mp_limb_t cy;
- mp_ptr pp;
-
- pp = rp;
- mpn_mul (pp, tp + 1, qn, dp, dn);
-
- cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;
-
- if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
- qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
- else /* Same as above */
- MPN_COPY (qp, tp + 1, qn);
+ /* Fall back to plain mpn_mu_div_qr. */
+ mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);
}
}
else
{
/* |_______________________| dividend
|________________| divisor */
+ mpn_mu_divappr_q (tp, np + nn - (2*qn + 2), 2*qn + 2, dp + dn - (qn + 1), qn + 1, scratch);
- /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
- here becomes 2dn, i.e., more than nn. This shouldn't hurt, since only
- the most significant dn-1 limbs will actually be read, but it is not
- pretty. */
-
- qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
- dp + dn - (qn + 1), qn + 1, scratch);
-
- /* The max error of mpn_mu_divappr_q is +4, but we get an additional
- error from the divisor truncation. */
- if (tp[0] > 6)
+ if (tp[0] > 4)
{
MPN_COPY (qp, tp + 1, qn);
}
else
{
- mp_limb_t cy;
-
- /* FIXME: a shorter product should be enough; we may use already
- allocated space... */
- rp = TMP_BALLOC_LIMBS (nn);
- mpn_mul (rp, dp, dn, tp + 1, qn);
-
- cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;
-
- if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
- qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
- else /* Same as above */
- MPN_COPY (qp, tp + 1, qn);
+ rp = TMP_BALLOC_LIMBS (dn);
+ mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);
}
}
TMP_FREE;
- return qh;
-}
-
-mp_size_t
-mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
-{
- mp_size_t qn;
-
- qn = nn - dn;
- if (qn >= dn)
- {
- return mpn_mu_divappr_q_itch (nn + 1, dn, mua_k);
- }
- else
- {
- return mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
- }
+ return 0;
}
diff --git a/gmp/mpn/generic/mu_div_qr.c b/gmp/mpn/generic/mu_div_qr.c
index f4700a1ea6..9049e5907a 100644
--- a/gmp/mpn/generic/mu_div_qr.c
+++ b/gmp/mpn/generic/mu_div_qr.c
@@ -7,67 +7,87 @@
Contributed to the GNU project by Torbjorn Granlund.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
-Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2005, 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+/* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann
+ and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
+ Jebelean's bidirectional exact division algorithm.
+ The idea of this algorithm is to compute a smaller inverted value than used
+ in the standard Barrett algorithm, and thus save time in the Newton
+ iterations, and pay just a small price when using the inverted value for
+ developing quotient bits.
-/*
- The idea of the algorithm used herein is to compute a smaller inverted value
- than used in the standard Barrett algorithm, and thus save time in the
- Newton iterations, and pay just a small price when using the inverted value
- for developing quotient bits. This algorithm was presented at ICMS 2006.
+ Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the
+ "wrap around" trick. Based on the GMP divexact code and inspired by code
+ contributed to GMP by Karl Hasselstroem.
*/
-/* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync.
+
+/* CAUTION: This code and the code in mu_divappr_q.c should be edited in lockstep.
Things to work on:
+ * Passing k isn't a great interface. Either 'in' should be passed, or
+ determined by the code.
+
+ * The current mpn_mu_div_qr_itch isn't exactly scientifically written.
+ Scratch space buffer overruns are not unlikely before some analysis is
+ applied. Since scratch requirements are expected to change, such an
+ analysis will have to wait until things settle.
+
+ * This isn't optimal when the remainder isn't needed, since the final
+ multiplication could be made special and take O(1) time on average, in that
+ case. This is particularly bad when qn << dn. At some level, code as in
+ GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn
+ dividend limbs by the qn divisor limbs.
+
* This isn't optimal when the quotient isn't needed, as it might take a lot
- of space. The computation is always needed, though, so there is no time to
- save with special code.
+ of space. The computation is always needed, though, so there is no time
+ to save with special code.
* The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
- demonstrated by the fact that the mpn_invertappr function's scratch needs
- mean that we need to keep a large allocation long after it is needed.
- Things are worse as mpn_mul_fft does not accept any scratch parameter,
- which means we'll have a large memory hole while in mpn_mul_fft. In
- general, a peak scratch need in the beginning of a function isn't
- well-handled by the itch/scratch scheme.
+ demonstrated by the fact that the mpn_invert function's scratch needs mean
+ that we need to keep a large allocation long after it is needed. Things
+ are worse as mpn_mul_fft does not accept any scratch parameter, which means
+ we'll have a large memory hole while in mpn_mul_fft. In general, a peak
+ scratch need in the beginning of a function isn't well-handled by the
+ itch/scratch scheme.
+
+ * Some ideas from comments in divexact.c apply to this code too.
*/
+/* The NOSTAT stuff properly handles the case where files are concatenated. */
+#ifdef NOSTAT
+#undef STAT
+#endif
+
#ifdef STAT
#undef STAT
#define STAT(x) x
#else
+#define NOSTAT
#define STAT(x)
#endif
@@ -76,98 +96,65 @@ see https://www.gnu.org/licenses/. */
#include "gmp-impl.h"
-/* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly. It gives a
- speedup according to old measurements, but does the decision mechanism
- really make sense? It seem like the quotient between dn and qn might be
- what we really should be checking. */
-#ifndef MU_DIV_QR_SKEW_THRESHOLD
-#define MU_DIV_QR_SKEW_THRESHOLD 100
-#endif
-
-#ifdef CHECK /* FIXME: Enable in minithres */
-#undef MU_DIV_QR_SKEW_THRESHOLD
-#define MU_DIV_QR_SKEW_THRESHOLD 1
-#endif
-
-
-static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
-
-
-mp_limb_t
-mpn_mu_div_qr (mp_ptr qp,
- mp_ptr rp,
- mp_srcptr np,
- mp_size_t nn,
- mp_srcptr dp,
- mp_size_t dn,
- mp_ptr scratch)
+/* In case k=0 (automatic choice), we distinguish 3 cases:
+ (a) dn < qn: in = ceil(qn / ceil(qn/dn))
+ (b) dn/3 < qn <= dn: in = ceil(qn / 2)
+ (c) qn <= dn/3: in = qn
+ In all cases we have in <= dn.
+ */
+mp_size_t
+mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
- mp_size_t qn;
- mp_limb_t cy, qh;
+ mp_size_t in;
- qn = nn - dn;
- if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn)
+ if (k == 0)
{
- /* |______________|_ign_first__| dividend nn
- |_______|_ign_first__| divisor dn
-
- |______| quotient (prel) qn
-
- |___________________| quotient * ignored-divisor-part dn-1
- */
-
- /* Compute a preliminary quotient and a partial remainder by dividing the
- most significant limbs of each operand. */
- qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
- np + nn - (2 * qn + 1), 2 * qn + 1,
- dp + dn - (qn + 1), qn + 1,
- scratch);
-
- /* Multiply the quotient by the divisor limbs ignored above. */
- if (dn - (qn + 1) > qn)
- mpn_mul (scratch, dp, dn - (qn + 1), qp, qn); /* prod is dn-1 limbs */
- else
- mpn_mul (scratch, qp, qn, dp, dn - (qn + 1)); /* prod is dn-1 limbs */
-
- if (qh)
- cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1));
+ mp_size_t b;
+ if (qn > dn)
+ {
+ /* Compute an inverse size that is a nice partition of the quotient. */
+ b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
+ in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+ }
+ else if (3 * qn > dn)
+ {
+ in = (qn - 1) / 2 + 1; /* b = 2 */
+ }
else
- cy = 0;
- scratch[dn - 1] = cy;
-
- cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
- cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
- rp + nn - (2 * qn + 1),
- scratch + nn - (2 * qn + 1),
- qn + 1, cy);
- if (cy)
{
- qh -= mpn_sub_1 (qp, qp, qn, 1);
- mpn_add_n (rp, rp, dp, dn);
+ in = (qn - 1) / 1 + 1; /* b = 1 */
}
}
else
{
- qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
+ mp_size_t xn;
+ xn = MIN (dn, qn);
+ in = (xn - 1) / k + 1;
}
- return qh;
+ return in;
}
static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
mp_ptr rp,
- mp_srcptr np,
+ mp_ptr np,
mp_size_t nn,
mp_srcptr dp,
mp_size_t dn,
mp_ptr scratch)
{
mp_size_t qn, in;
- mp_limb_t cy, qh;
+ mp_limb_t cy;
mp_ptr ip, tp;
- ASSERT (dn > 1);
+ /* FIXME: We should probably not handle tiny operands, but do it for now. */
+ if (dn == 1)
+ {
+ rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
+ MPN_COPY (qp, scratch, nn - 1);
+ return scratch[nn - 1];
+ }
qn = nn - dn;
@@ -178,7 +165,7 @@ mpn_mu_div_qr2 (mp_ptr qp,
#if 1
/* This alternative inverse computation method gets slightly more accurate
results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function
- not adapted (3) mpn_invertappr scratch needs not met. */
+ not adapted (3) mpn_invert scratch needs not met. */
ip = scratch;
tp = scratch + in + 1;
@@ -187,7 +174,7 @@ mpn_mu_div_qr2 (mp_ptr qp,
{
MPN_COPY (tp + 1, dp, in);
tp[0] = 1;
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invert (ip, tp, in + 1, NULL);
MPN_COPY_INCR (ip, ip + 1, in);
}
else
@@ -197,7 +184,7 @@ mpn_mu_div_qr2 (mp_ptr qp,
MPN_ZERO (ip, in);
else
{
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invert (ip, tp, in + 1, NULL);
MPN_COPY_INCR (ip, ip + 1, in);
}
}
@@ -213,11 +200,11 @@ mpn_mu_div_qr2 (mp_ptr qp,
{
tp[in + 1] = 0;
MPN_COPY (tp + in + 2, dp, in);
- mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
+ mpn_invert (tp, tp + in + 1, in + 1, NULL);
}
else
{
- mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
+ mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
}
cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
if (UNLIKELY (cy != 0))
@@ -225,15 +212,24 @@ mpn_mu_div_qr2 (mp_ptr qp,
MPN_COPY (ip, tp + 1, in);
#endif
- qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);
+/* We can't really handle qh = 1 like this since we'd here clobber N, which is
+ not allowed in the way we've defined this function's API. */
+#if 0
+ qh = mpn_cmp (np + qn, dp, dn) >= 0;
+ if (qh != 0)
+ mpn_sub_n (np + qn, np + qn, dp, dn);
+#endif
- return qh;
+ mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);
+
+/* return qh; */
+ return 0;
}
-mp_limb_t
+void
mpn_preinv_mu_div_qr (mp_ptr qp,
mp_ptr rp,
- mp_srcptr np,
+ mp_ptr np,
mp_size_t nn,
mp_srcptr dp,
mp_size_t dn,
@@ -242,26 +238,24 @@ mpn_preinv_mu_div_qr (mp_ptr qp,
mp_ptr scratch)
{
mp_size_t qn;
- mp_limb_t cy, cx, qh;
+ mp_limb_t cy;
+ mp_ptr tp;
mp_limb_t r;
- mp_size_t tn, wn;
-
-#define tp scratch
-#define scratch_out (scratch + tn)
qn = nn - dn;
+ if (qn == 0)
+ {
+ MPN_COPY (rp, np, dn);
+ return;
+ }
+
+ tp = scratch;
+
np += qn;
qp += qn;
- qh = mpn_cmp (np, dp, dn) >= 0;
- if (qh != 0)
- mpn_sub_n (rp, np, dp, dn);
- else
- MPN_COPY_INCR (rp, np, dn);
-
- if (qn == 0)
- return qh; /* Degenerate use. Should we allow this? */
+ MPN_COPY (rp, np, dn);
while (qn > 0)
{
@@ -277,30 +271,36 @@ mpn_preinv_mu_div_qr (mp_ptr qp,
by the upper part of the partial remainder R. */
mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */
cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */
- ASSERT_ALWAYS (cy == 0);
-
- qn -= in;
+ ASSERT_ALWAYS (cy == 0); /* FIXME */
/* Compute the product of the quotient block and the divisor D, to be
subtracted from the partial remainder combined with new limbs from the
- dividend N. We only really need the low dn+1 limbs. */
-
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */
- else
+ dividend N. We only really need the low dn limbs. */
+#if WANT_FFT
+ if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
{
- tn = mpn_mulmod_bnm1_next_size (dn + 1);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
- wn = dn + in - tn; /* number of wrapped limbs */
+ /* Use the wrap-around trick. */
+ mp_size_t m, wn;
+ int k;
+
+ k = mpn_fft_best_k (dn + 1, 0);
+ m = mpn_fft_next_size (dn + 1, k);
+ wn = dn + in - m; /* number of wrapped limbs */
+
+ mpn_mul_fft (tp, m, dp, dn, qp, in, k);
+
if (wn > 0)
{
- cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
- cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
- cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
- ASSERT_ALWAYS (cx >= cy);
- mpn_incr_u (tp, cx - cy);
+ cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
+ mpn_incr_u (tp + wn, cy);
+
+ cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
+ mpn_decr_u (tp, cy);
}
}
+ else
+#endif
+ mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */
r = rp[dn - in] - tp[dn];
@@ -352,65 +352,112 @@ mpn_preinv_mu_div_qr (mp_ptr qp,
printf ("\n");
}
);
- }
- return qh;
+ qn -= in;
+ }
}
-/* In case k=0 (automatic choice), we distinguish 3 cases:
- (a) dn < qn: in = ceil(qn / ceil(qn/dn))
- (b) dn/3 < qn <= dn: in = ceil(qn / 2)
- (c) qn < dn/3: in = qn
- In all cases we have in <= dn.
- */
-mp_size_t
-mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
+#define THRES 100 /* FIXME: somewhat arbitrary */
+
+#ifdef CHECK
+#undef THRES
+#define THRES 1
+#endif
+
+mp_limb_t
+mpn_mu_div_qr (mp_ptr qp,
+ mp_ptr rp,
+ mp_ptr np,
+ mp_size_t nn,
+ mp_srcptr dp,
+ mp_size_t dn,
+ mp_ptr scratch)
{
- mp_size_t in;
+ mp_size_t qn;
- if (k == 0)
+ qn = nn - dn;
+ if (qn + THRES < dn)
{
- mp_size_t b;
- if (qn > dn)
- {
- /* Compute an inverse size that is a nice partition of the quotient. */
- b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
- in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
- }
- else if (3 * qn > dn)
+ /* |______________|________| dividend nn
+ |_______|________| divisor dn
+
+ |______| quotient (prel) qn
+
+ |_______________| quotient * ignored-part-of(divisor) dn-1
+ */
+
+ mp_limb_t cy, x;
+
+ if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0)
{
- in = (qn - 1) / 2 + 1; /* b = 2 */
+ /* Quotient is 111...111, could optimize this rare case at some point. */
+ mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
+ return 0;
}
+
+ /* Compute a preliminary quotient and a partial remainder by dividing the
+ most significant limbs of each operand. */
+ mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
+ np + nn - (2 * qn + 1), 2 * qn + 1,
+ dp + dn - (qn + 1), qn + 1,
+ scratch);
+
+ /* Multiply the quotient by the divisor limbs ignored above. */
+ if (dn - (qn + 1) > qn)
+ mpn_mul (scratch, dp, dn - (qn + 1), qp, qn); /* prod is dn-1 limbs */
else
+ mpn_mul (scratch, qp, qn, dp, dn - (qn + 1)); /* prod is dn-1 limbs */
+
+ cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
+ cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
+ rp + nn - (2 * qn + 1),
+ scratch + nn - (2 * qn + 1),
+ qn, cy);
+ x = rp[dn - 1];
+ rp[dn - 1] = x - cy;
+ if (cy > x)
{
- in = (qn - 1) / 1 + 1; /* b = 1 */
+ mpn_decr_u (qp, 1);
+ mpn_add_n (rp, rp, dp, dn);
}
}
else
{
- mp_size_t xn;
- xn = MIN (dn, qn);
- in = (xn - 1) / k + 1;
+ return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
}
- return in;
+ return 0; /* FIXME */
}
mp_size_t
mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
- mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
- mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k);
- mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
+ mp_size_t qn, m;
+ int k;
- return in + itch_local + itch_out;
-}
+ /* FIXME: This isn't very carefully written, and might grossly overestimate
+ the amount of scratch needed, and might perhaps also underestimate it,
+ leading to potential buffer overruns. In particular k=0 might lead to
+ gross overestimates. */
-mp_size_t
-mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in)
-{
- mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
- mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
+ if (dn == 1)
+ return nn;
- return itch_local + itch_out;
+ qn = nn - dn;
+ if (qn >= dn)
+ {
+ k = mpn_fft_best_k (dn + 1, 0);
+ m = mpn_fft_next_size (dn + 1, k);
+ return (mua_k <= 1
+ ? 6 * dn
+ : m + 2 * dn);
+ }
+ else
+ {
+ k = mpn_fft_best_k (dn + 1, 0);
+ m = mpn_fft_next_size (dn + 1, k);
+ return (mua_k <= 1
+ ? m + 4 * qn
+ : m + 2 * qn);
+ }
}
diff --git a/gmp/mpn/generic/mu_divappr_q.c b/gmp/mpn/generic/mu_divappr_q.c
index c218b59fee..0a0434399f 100644
--- a/gmp/mpn/generic/mu_divappr_q.c
+++ b/gmp/mpn/generic/mu_divappr_q.c
@@ -7,63 +7,87 @@
Contributed to the GNU project by Torbjorn Granlund.
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
-Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2005, 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+/* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann
+ and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
+ Jebelean's bidirectional exact division algorithm.
-/*
- The idea of the algorithm used herein is to compute a smaller inverted value
- than used in the standard Barrett algorithm, and thus save time in the
- Newton iterations, and pay just a small price when using the inverted value
- for developing quotient bits. This algorithm was presented at ICMS 2006.
+ The idea of this algorithm is to compute a smaller inverted value than used
+ in the standard Barrett algorithm, and thus save time in the Newton
+ iterations, and pay just a small price when using the inverted value for
+ developing quotient bits.
+
+ Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the
+ "wrap around" trick. Based on the GMP divexact code and inspired by code
+ contributed to GMP by Karl Hasselstroem.
*/
-/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
+
+/* CAUTION: This code and the code in mu_div_qr.c should be edited in lockstep.
Things to work on:
+ * Passing k isn't a great interface. Either 'in' should be passed, or
+ determined by the code.
+
+ * The current mpn_mu_div_qr_itch isn't exactly scientifically written.
+ Scratch space buffer overruns are not unlikely before some analysis is
+ applied. Since scratch requirements are expected to change, such an
+ analysis will have to wait until things settle.
+
+ * This isn't optimal when the remainder isn't needed, since the final
+ multiplication could be made special and take O(1) time on average, in that
+ case. This is particularly bad when qn << dn. At some level, code as in
+ GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn
+ dividend limbs by the qn divisor limbs.
+
+ * This isn't optimal when the quotient isn't needed, as it might take a lot
+ of space. The computation is always needed, though, so there is no time
+ to save with special code.
+
* The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
- demonstrated by the fact that the mpn_invertappr function's scratch needs
- mean that we need to keep a large allocation long after it is needed.
- Things are worse as mpn_mul_fft does not accept any scratch parameter,
- which means we'll have a large memory hole while in mpn_mul_fft. In
- general, a peak scratch need in the beginning of a function isn't
- well-handled by the itch/scratch scheme.
+ demonstrated by the fact that the mpn_invert function's scratch needs mean
+ that we need to keep a large allocation long after it is needed. Things
+ are worse as mpn_mul_fft does not accept any scratch parameter, which means
+ we'll have a large memory hole while in mpn_mul_fft. In general, a peak
+ scratch need in the beginning of a function isn't well-handled by the
+ itch/scratch scheme.
+
+ * Some ideas from comments in divexact.c apply to this code too.
*/
+/* The NOSTAT stuff properly handles the case where files are concatenated. */
+#ifdef NOSTAT
+#undef STAT
+#endif
+
#ifdef STAT
#undef STAT
#define STAT(x) x
#else
+#define NOSTAT
#define STAT(x)
#endif
@@ -72,22 +96,68 @@ see https://www.gnu.org/licenses/. */
#include "gmp-impl.h"
+/* In case k=0 (automatic choice), we distinguish 3 cases:
+ (a) dn < qn: in = ceil(qn / ceil(qn/dn))
+ (b) dn/3 < qn <= dn: in = ceil(qn / 2)
+ (c) qn <= dn/3: in = qn
+ In all cases we have in <= dn.
+ */
+mp_size_t
+mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
+{
+ mp_size_t in;
+
+ if (k == 0)
+ {
+ mp_size_t b;
+ if (qn > dn)
+ {
+ /* Compute an inverse size that is a nice partition of the quotient. */
+ b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
+ in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+ }
+ else if (3 * qn > dn)
+ {
+ in = (qn - 1) / 2 + 1; /* b = 2 */
+ }
+ else
+ {
+ in = (qn - 1) / 1 + 1; /* b = 1 */
+ }
+ }
+ else
+ {
+ mp_size_t xn;
+ xn = MIN (dn, qn);
+ in = (xn - 1) / k + 1;
+ }
+
+ return in;
+}
+
mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
- mp_srcptr np,
+ mp_ptr np,
mp_size_t nn,
mp_srcptr dp,
mp_size_t dn,
mp_ptr scratch)
{
mp_size_t qn, in;
- mp_limb_t cy, qh;
+ mp_limb_t cy;
mp_ptr ip, tp;
- ASSERT (dn > 1);
+ /* FIXME: We should probably not handle tiny operands, but do it for now. */
+ if (dn == 1)
+ {
+ mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
+ MPN_COPY (qp, scratch, nn - 1);
+ return scratch[nn - 1];
+ }
qn = nn - dn;
+#if 1
/* If Q is smaller than D, truncate operands. */
if (qn + 1 < dn)
{
@@ -95,7 +165,18 @@ mpn_mu_divappr_q (mp_ptr qp,
nn -= dn - (qn + 1);
dp += dn - (qn + 1);
dn = qn + 1;
+
+ /* Since D is cut here, we can have a carry in N'/D' even if we don't
+ have it for N/D. */
+ if (mpn_cmp (np + nn - (qn + 1), dp, qn + 1) >= 0)
+ { /* quotient is 111...111 */
+ mp_size_t i;
+ for (i = 0; i <= qn; i ++)
+ qp[i] = ~ (mp_limb_t) 0;
+ return 0;
+ }
}
+#endif
/* Compute the inverse size. */
in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
@@ -104,7 +185,7 @@ mpn_mu_divappr_q (mp_ptr qp,
#if 1
/* This alternative inverse computation method gets slightly more accurate
results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function
- not adapted (3) mpn_invertappr scratch needs not met. */
+ not adapted (3) mpn_invert scratch needs not met. */
ip = scratch;
tp = scratch + in + 1;
@@ -113,7 +194,7 @@ mpn_mu_divappr_q (mp_ptr qp,
{
MPN_COPY (tp + 1, dp, in);
tp[0] = 1;
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invert (ip, tp, in + 1, NULL);
MPN_COPY_INCR (ip, ip + 1, in);
}
else
@@ -123,7 +204,7 @@ mpn_mu_divappr_q (mp_ptr qp,
MPN_ZERO (ip, in);
else
{
- mpn_invertappr (ip, tp, in + 1, NULL);
+ mpn_invert (ip, tp, in + 1, NULL);
MPN_COPY_INCR (ip, ip + 1, in);
}
}
@@ -139,11 +220,11 @@ mpn_mu_divappr_q (mp_ptr qp,
{
tp[in + 1] = 0;
MPN_COPY (tp + in + 2, dp, in);
- mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
+ mpn_invert (tp, tp + in + 1, in + 1, NULL);
}
else
{
- mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
+ mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
}
cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
if (UNLIKELY (cy != 0))
@@ -151,14 +232,23 @@ mpn_mu_divappr_q (mp_ptr qp,
MPN_COPY (ip, tp + 1, in);
#endif
- qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
+/* We can't really handle qh = 1 like this since we'd here clobber N, which is
+ not allowed in the way we've defined this function's API. */
+#if 0
+ qh = mpn_cmp (np + qn, dp, dn) >= 0;
+ if (qh != 0)
+ mpn_sub_n (np + qn, np + qn, dp, dn);
+#endif
+
+ mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
- return qh;
+/* return qh; */
+ return 0;
}
-mp_limb_t
+void
mpn_preinv_mu_divappr_q (mp_ptr qp,
- mp_srcptr np,
+ mp_ptr np,
mp_size_t nn,
mp_srcptr dp,
mp_size_t dn,
@@ -166,28 +256,24 @@ mpn_preinv_mu_divappr_q (mp_ptr qp,
mp_size_t in,
mp_ptr scratch)
{
+ mp_ptr rp;
mp_size_t qn;
- mp_limb_t cy, cx, qh;
+ mp_limb_t cy;
+ mp_ptr tp;
mp_limb_t r;
- mp_size_t tn, wn;
-
-#define rp scratch
-#define tp (scratch + dn)
-#define scratch_out (scratch + dn + tn)
qn = nn - dn;
+ if (qn == 0)
+ return;
+
+ rp = scratch;
+ tp = scratch + dn;
+
np += qn;
qp += qn;
- qh = mpn_cmp (np, dp, dn) >= 0;
- if (qh != 0)
- mpn_sub_n (rp, np, dp, dn);
- else
- MPN_COPY (rp, np, dn);
-
- if (qn == 0)
- return qh; /* Degenerate use. Should we allow this? */
+ MPN_COPY (rp, np, dn);
while (qn > 0)
{
@@ -203,7 +289,7 @@ mpn_preinv_mu_divappr_q (mp_ptr qp,
by the upper part of the partial remainder R. */
mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */
cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */
- ASSERT_ALWAYS (cy == 0);
+ ASSERT_ALWAYS (cy == 0); /* FIXME */
qn -= in;
if (qn == 0)
@@ -212,23 +298,31 @@ mpn_preinv_mu_divappr_q (mp_ptr qp,
/* Compute the product of the quotient block and the divisor D, to be
subtracted from the partial remainder combined with new limbs from the
dividend N. We only really need the low dn limbs. */
-
- if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
- mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */
- else
+#if WANT_FFT
+ if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
{
- tn = mpn_mulmod_bnm1_next_size (dn + 1);
- mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
- wn = dn + in - tn; /* number of wrapped limbs */
+ /* Use the wrap-around trick. */
+ mp_size_t m, wn;
+ int k;
+
+ k = mpn_fft_best_k (dn + 1, 0);
+ m = mpn_fft_next_size (dn + 1, k);
+ wn = dn + in - m; /* number of wrapped limbs */
+
+ mpn_mul_fft (tp, m, dp, dn, qp, in, k);
+
if (wn > 0)
{
- cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
- cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
- cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
- ASSERT_ALWAYS (cx >= cy);
- mpn_incr_u (tp, cx - cy);
+ cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
+ mpn_incr_u (tp + wn, cy);
+
+ cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
+ mpn_decr_u (tp, cy);
}
}
+ else
+#endif
+ mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */
r = rp[dn - in] - tp[dn];
@@ -284,80 +378,45 @@ mpn_preinv_mu_divappr_q (mp_ptr qp,
/* FIXME: We should perhaps be somewhat more elegant in our rounding of the
quotient. For now, just make sure the returned quotient is >= the real
- quotient; add 3 with saturating arithmetic. */
+ quotient. */
qn = nn - dn;
- cy += mpn_add_1 (qp, qp, qn, 3);
+ cy = mpn_add_1 (qp, qp, qn, 3);
if (cy != 0)
{
- if (qh != 0)
- {
- /* Return a quotient of just 1-bits, with qh set. */
- mp_size_t i;
- for (i = 0; i < qn; i++)
- qp[i] = GMP_NUMB_MAX;
- }
- else
- {
- /* Propagate carry into qh. */
- qh = 1;
- }
+ MPN_ZERO (qp, qn);
+ mpn_sub_1 (qp, qp, qn, 1);
}
-
- return qh;
}
-/* In case k=0 (automatic choice), we distinguish 3 cases:
- (a) dn < qn: in = ceil(qn / ceil(qn/dn))
- (b) dn/3 < qn <= dn: in = ceil(qn / 2)
- (c) qn < dn/3: in = qn
- In all cases we have in <= dn.
- */
mp_size_t
-mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
+mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
- mp_size_t in;
+ mp_size_t qn, m;
+ int k;
- if (k == 0)
- {
- mp_size_t b;
- if (qn > dn)
- {
- /* Compute an inverse size that is a nice partition of the quotient. */
- b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */
- in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
- }
- else if (3 * qn > dn)
- {
- in = (qn - 1) / 2 + 1; /* b = 2 */
- }
- else
- {
- in = (qn - 1) / 1 + 1; /* b = 1 */
- }
- }
- else
- {
- mp_size_t xn;
- xn = MIN (dn, qn);
- in = (xn - 1) / k + 1;
- }
+ /* FIXME: This isn't very carefully written, and might grossly overestimate
+ the amount of scratch needed, and might perhaps also underestimate it,
+ leading to potential buffer overruns. In particular k=0 might lead to
+ gross overestimates. */
- return in;
-}
-
-mp_size_t
-mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
-{
- mp_size_t qn, in, itch_local, itch_out;
+ if (dn == 1)
+ return nn;
qn = nn - dn;
- if (qn + 1 < dn)
+ if (qn >= dn)
{
- dn = qn + 1;
+ k = mpn_fft_best_k (dn + 1, 0);
+ m = mpn_fft_next_size (dn + 1, k);
+ return dn + (mua_k <= 1
+ ? 6 * dn
+ : m + 2 * dn);
+ }
+ else
+ {
+ k = mpn_fft_best_k (dn + 1, 0);
+ m = mpn_fft_next_size (dn + 1, k);
+ return dn + (mua_k <= 1
+ ? m + 4 * qn
+ : m + 2 * qn);
}
- in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
-
- itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
- itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
- return in + dn + itch_local + itch_out;
}
diff --git a/gmp/mpn/generic/mul.c b/gmp/mpn/generic/mul.c
index 2d72df3d4d..489e1f524f 100644
--- a/gmp/mpn/generic/mul.c
+++ b/gmp/mpn/generic/mul.c
@@ -2,34 +2,23 @@
Contributed to the GNU project by Torbjorn Granlund.
-Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012
-Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005,
+2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -39,42 +28,6 @@ see https://www.gnu.org/licenses/. */
#define MUL_BASECASE_MAX_UN 500
#endif
-/* Areas where the different toom algorithms can be called (extracted
- from the t-toom*.c files, and ignoring small constant offsets):
-
- 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un
- 4/7 6/7
- 6/11
- |--------------------| toom22 (small)
- || toom22 (large)
- |xxxx| toom22 called
- |-------------------------------------| toom32
- |xxxxxxxxxxxxxxxx| | toom32 called
- |------------| toom33
- |x| toom33 called
- |---------------------------------| | toom42
- |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called
- |--------------------| toom43
- |xxxxxxxxxx| toom43 called
- |-----------------------------| toom52 (unused)
- |--------| toom44
- |xxxxxxxx| toom44 called
- |--------------------| | toom53
- |xxxxxx| toom53 called
- |-------------------------| toom62 (unused)
- |----------------| toom54 (unused)
- |--------------------| toom63
- |xxxxxxxxx| | toom63 called
- |---------------------------------| toom6h
- |xxxxxxxx| toom6h called
- |-------------------------| toom8h (32 bit)
- |------------------------------------------| toom8h (64 bit)
- |xxxxxxxx| toom8h called
-*/
-
-#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
-#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
-
/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
(pointed to by VP, with VN limbs), and store the result at PRODP. The
result is UN + VN limbs. Return the most significant limb of the result.
@@ -87,34 +40,6 @@ see https://www.gnu.org/licenses/. */
2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
the multiplier and the multiplicand. */
-/*
- * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
- ideal lines of the surrounding algorithms. Is that optimal?
-
- * The toomX3 code now uses a structure similar to the one of toomX2, except
- that it loops longer in the unbalanced case. The result is that the
- remaining area might have un < vn. Should we fix the toomX2 code in a
- similar way?
-
- * The toomX3 code is used for the largest non-FFT unbalanced operands. It
- therefore calls mpn_mul recursively for certain cases.
-
- * Allocate static temp space using THRESHOLD variables (except for toom44
- when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
-
- * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
- have the same vn threshold. This is not true, we should actually use
- mul_basecase for slightly larger operands for toom32 than for toom22, and
- even larger for toom42.
-
- * That problem is even more prevalent for toomX3. We therefore use special
- THRESHOLD variables there.
-
- * Is our ITCH allocation correct?
-*/
-
-#define ITCH (16*vn + 100)
-
mp_limb_t
mpn_mul (mp_ptr prodp,
mp_srcptr up, mp_size_t un,
@@ -128,11 +53,13 @@ mpn_mul (mp_ptr prodp,
if (un == vn)
{
if (up == vp)
- mpn_sqr (prodp, up, un);
+ mpn_sqr_n (prodp, up, un);
else
mpn_mul_n (prodp, up, vp, un);
+ return prodp[2 * un - 1];
}
- else if (vn < MUL_TOOM22_THRESHOLD)
+
+ if (vn < MUL_KARATSUBA_THRESHOLD)
{ /* plain schoolbook multiplication */
 /* Unless un is very large, or else if we have an applicable mpn_mul_N,
@@ -171,9 +98,9 @@ mpn_mul (mp_ptr prodp,
The parts marked with X are the parts whose sums are copied into
the temporary buffer. */
- mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
+ mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT];
mp_limb_t cy;
- ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
+ ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT);
mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
prodp += MUL_BASECASE_MAX_UN;
@@ -184,7 +111,7 @@ mpn_mul (mp_ptr prodp,
{
mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
- mpn_incr_u (prodp + vn, cy);
+ mpn_incr_u (prodp + vn, cy); /* safe? */
prodp += MUL_BASECASE_MAX_UN;
MPN_COPY (tp, prodp, vn); /* preserve high triangle */
up += MUL_BASECASE_MAX_UN;
@@ -196,233 +123,100 @@ mpn_mul (mp_ptr prodp,
}
else
{
- ASSERT (un > 0);
+ ASSERT_ALWAYS (un > 0);
mpn_mul_basecase (prodp, vp, vn, up, un);
}
cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
- mpn_incr_u (prodp + vn, cy);
+ mpn_incr_u (prodp + vn, cy); /* safe? */
}
+ return prodp[un + vn - 1];
}
- else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
- {
- /* Use ToomX2 variants */
- mp_ptr scratch;
- TMP_SDECL; TMP_SMARK;
-
- scratch = TMP_SALLOC_LIMBS (ITCH);
-
- /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
- square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
- wise; we would get better balance by slightly moving the bound. We
- will sometimes end up with un < vn, like in the X3 arm below. */
- if (un >= 3 * vn)
- {
- mp_limb_t cy;
- mp_ptr ws;
-
- /* The maximum ws usage is for the mpn_mul result. */
- ws = TMP_SALLOC_LIMBS (4 * vn);
- mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
- un -= 2 * vn;
- up += 2 * vn;
- prodp += 2 * vn;
-
- while (un >= 3 * vn)
- {
- mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
- un -= 2 * vn;
- up += 2 * vn;
- cy = mpn_add_n (prodp, prodp, ws, vn);
- MPN_COPY (prodp + vn, ws + vn, 2 * vn);
- mpn_incr_u (prodp + vn, cy);
- prodp += 2 * vn;
- }
-
- /* vn <= un < 3vn */
-
- if (4 * un < 5 * vn)
- mpn_toom22_mul (ws, up, un, vp, vn, scratch);
- else if (4 * un < 7 * vn)
- mpn_toom32_mul (ws, up, un, vp, vn, scratch);
- else
- mpn_toom42_mul (ws, up, un, vp, vn, scratch);
-
- cy = mpn_add_n (prodp, prodp, ws, vn);
- MPN_COPY (prodp + vn, ws + vn, un);
- mpn_incr_u (prodp + vn, cy);
- }
- else
- {
- if (4 * un < 5 * vn)
- mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
- else if (4 * un < 7 * vn)
- mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
- else
- mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
- }
- TMP_SFREE;
- }
- else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
- BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
+ if (ABOVE_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) &&
+ ABOVE_THRESHOLD (vn, MUL_FFT_THRESHOLD / 3)) /* FIXME */
{
- /* Handle the largest operands that are not in the FFT range. The 2nd
- condition makes very unbalanced operands avoid the FFT code (except
- perhaps as coefficient products of the Toom code). */
-
- if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
- {
- /* Use ToomX3 variants */
- mp_ptr scratch;
- TMP_SDECL; TMP_SMARK;
-
- scratch = TMP_SALLOC_LIMBS (ITCH);
-
- if (2 * un >= 5 * vn)
- {
- mp_limb_t cy;
- mp_ptr ws;
-
- /* The maximum ws usage is for the mpn_mul result. */
- ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
-
- if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
- mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
- else
- mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
- un -= 2 * vn;
- up += 2 * vn;
- prodp += 2 * vn;
-
- while (2 * un >= 5 * vn) /* un >= 2.5vn */
- {
- if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
- mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
- else
- mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
- un -= 2 * vn;
- up += 2 * vn;
- cy = mpn_add_n (prodp, prodp, ws, vn);
- MPN_COPY (prodp + vn, ws + vn, 2 * vn);
- mpn_incr_u (prodp + vn, cy);
- prodp += 2 * vn;
- }
-
- /* vn / 2 <= un < 2.5vn */
-
- if (un < vn)
- mpn_mul (ws, vp, vn, up, un);
- else
- mpn_mul (ws, up, un, vp, vn);
-
- cy = mpn_add_n (prodp, prodp, ws, vn);
- MPN_COPY (prodp + vn, ws + vn, un);
- mpn_incr_u (prodp + vn, cy);
- }
- else
- {
- if (6 * un < 7 * vn)
- mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
- else if (2 * un < 3 * vn)
- {
- if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
- mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
- else
- mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
- }
- else if (6 * un < 11 * vn)
- {
- if (4 * un < 7 * vn)
- {
- if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
- mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
- else
- mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
- }
- else
- {
- if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
- mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
- else
- mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
- }
- }
- else
- {
- if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
- mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
- else
- mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
- }
- }
- TMP_SFREE;
- }
- else
- {
- mp_ptr scratch;
- TMP_DECL; TMP_MARK;
-
- if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
- {
- scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
- mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
- }
- else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
- {
- scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
- mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
- }
- else
- {
- scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
- mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
- }
- TMP_FREE;
- }
+ mpn_mul_fft_full (prodp, up, un, vp, vn);
+ return prodp[un + vn - 1];
}
- else
- {
- if (un >= 8 * vn)
- {
- mp_limb_t cy;
- mp_ptr ws;
- TMP_DECL; TMP_MARK;
-
- /* The maximum ws usage is for the mpn_mul result. */
- ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
-
- mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
- un -= 3 * vn;
- up += 3 * vn;
- prodp += 3 * vn;
- while (2 * un >= 7 * vn) /* un >= 3.5vn */
- {
- mpn_fft_mul (ws, up, 3 * vn, vp, vn);
- un -= 3 * vn;
- up += 3 * vn;
- cy = mpn_add_n (prodp, prodp, ws, vn);
- MPN_COPY (prodp + vn, ws + vn, 3 * vn);
- mpn_incr_u (prodp + vn, cy);
- prodp += 3 * vn;
- }
-
- /* vn / 2 <= un < 3.5vn */
-
- if (un < vn)
- mpn_mul (ws, vp, vn, up, un);
- else
- mpn_mul (ws, up, un, vp, vn);
-
- cy = mpn_add_n (prodp, prodp, ws, vn);
- MPN_COPY (prodp + vn, ws + vn, un);
- mpn_incr_u (prodp + vn, cy);
-
- TMP_FREE;
- }
- else
- mpn_fft_mul (prodp, up, un, vp, vn);
- }
+ {
+ mp_ptr ws;
+ mp_ptr scratch;
+#if WANT_ASSERT
+ mp_ptr ssssp;
+#endif
+ TMP_DECL;
+ TMP_MARK;
+
+#define WSALL (4 * vn)
+ ws = TMP_SALLOC_LIMBS (WSALL + 1);
+
+#define ITCH ((un + vn) * 4 + 100)
+ scratch = TMP_ALLOC_LIMBS (ITCH + 1);
+#if WANT_ASSERT
+ ssssp = scratch + ITCH;
+ ws[WSALL] = 0xbabecafe;
+ ssssp[0] = 0xbeef;
+#endif
- return prodp[un + vn - 1]; /* historic */
+ if (un >= 3 * vn)
+ {
+ mp_limb_t cy;
+
+ mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ prodp += 2 * vn;
+
+ while (un >= 3 * vn)
+ {
+ mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
+ un -= 2 * vn;
+ up += 2 * vn;
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, 2 * vn);
+ mpn_incr_u (prodp + vn, cy);
+ prodp += 2 * vn;
+ }
+
+ if (5 * un > 9 * vn)
+ {
+ mpn_toom42_mul (ws, up, un, vp, vn, scratch);
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, un);
+ mpn_incr_u (prodp + vn, cy);
+ }
+ else if (9 * un > 10 * vn)
+ {
+ mpn_toom32_mul (ws, up, un, vp, vn, scratch);
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, un);
+ mpn_incr_u (prodp + vn, cy);
+ }
+ else
+ {
+ mpn_toom22_mul (ws, up, un, vp, vn, scratch);
+ cy = mpn_add_n (prodp, prodp, ws, vn);
+ MPN_COPY (prodp + vn, ws + vn, un);
+ mpn_incr_u (prodp + vn, cy);
+ }
+
+ ASSERT (ws[WSALL] == 0xbabecafe);
+ ASSERT (ssssp[0] == 0xbeef);
+ TMP_FREE;
+ return prodp[un + vn - 1];
+ }
+
+ if (un * 5 > vn * 9)
+ mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+ else if (9 * un > 10 * vn)
+ mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+ else
+ mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
+
+ ASSERT (ws[WSALL] == 0xbabecafe);
+ ASSERT (ssssp[0] == 0xbeef);
+ TMP_FREE;
+ return prodp[un + vn - 1];
+ }
}
diff --git a/gmp/mpn/generic/mul_1.c b/gmp/mpn/generic/mul_1.c
index 6b2ee59a2c..b8290cc6af 100644
--- a/gmp/mpn/generic/mul_1.c
+++ b/gmp/mpn/generic/mul_1.c
@@ -1,33 +1,23 @@
/* mpn_mul_1 -- Multiply a limb vector with a single limb and store the
product in a second limb vector.
-Copyright 1991-1994, 1996, 2000-2002 Free Software Foundation, Inc.
+Copyright 1991, 1992, 1993, 1994, 1996, 2000, 2001, 2002 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/mul_basecase.c b/gmp/mpn/generic/mul_basecase.c
index 9309ef72c8..4f02545d57 100644
--- a/gmp/mpn/generic/mul_basecase.c
+++ b/gmp/mpn/generic/mul_basecase.c
@@ -4,33 +4,24 @@
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-Copyright 1991-1994, 1996, 1997, 2000-2002 Free Software Foundation, Inc.
+
+Copyright 1991, 1992, 1993, 1994, 1996, 1997, 2000, 2001, 2002 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -42,7 +33,7 @@ see https://www.gnu.org/licenses/. */
Note that prodp gets usize+vsize limbs stored, even if the actual result
only needs usize+vsize-1.
- There's no good reason to call here with vsize>=MUL_TOOM22_THRESHOLD.
+ There's no good reason to call here with vsize>=MUL_KARATSUBA_THRESHOLD.
Currently this is allowed, but it might not be in the future.
This is the most critical code for multiplication. All multiplies rely
diff --git a/gmp/mpn/generic/mul_fft.c b/gmp/mpn/generic/mul_fft.c
index 5e763a3a73..836a89a001 100644
--- a/gmp/mpn/generic/mul_fft.c
+++ b/gmp/mpn/generic/mul_fft.c
@@ -6,33 +6,23 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 1998-2010, 2012, 2013 Free Software Foundation, Inc.
+Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* References:
@@ -70,79 +60,76 @@ see https://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#ifdef WANT_ADDSUB
-#include "generic/add_n_sub_n.c"
-#define HAVE_NATIVE_mpn_add_n_sub_n 1
+#include "generic/addsub_n.c"
+#define HAVE_NATIVE_mpn_addsub_n 1
#endif
-static mp_limb_t mpn_mul_fft_internal (mp_ptr, mp_size_t, int, mp_ptr *,
- mp_ptr *, mp_ptr, mp_ptr, mp_size_t,
- mp_size_t, mp_size_t, int **, mp_ptr, int);
-static void mpn_mul_fft_decompose (mp_ptr, mp_ptr *, mp_size_t, mp_size_t, mp_srcptr,
- mp_size_t, mp_size_t, mp_size_t, mp_ptr);
+static mp_limb_t mpn_mul_fft_internal
+__GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, int, int, mp_ptr *, mp_ptr *,
+ mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_size_t, int **, mp_ptr,
+ int));
/* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n.
- We have sqr=0 if for a multiply, sqr=1 for a square.
- There are three generations of this code; we keep the old ones as long as
- some gmp-mparam.h is not updated. */
-
-
-/*****************************************************************************/
-
-#if TUNE_PROGRAM_BUILD || (defined (MUL_FFT_TABLE3) && defined (SQR_FFT_TABLE3))
+ sqr==0 for a multiply, sqr==1 for a square.
+ Don't declare it static since it is needed by tuneup.
+*/
+#ifdef MUL_FFT_TABLE2
-#ifndef FFT_TABLE3_SIZE /* When tuning this is defined in gmp-impl.h */
-#if defined (MUL_FFT_TABLE3_SIZE) && defined (SQR_FFT_TABLE3_SIZE)
-#if MUL_FFT_TABLE3_SIZE > SQR_FFT_TABLE3_SIZE
-#define FFT_TABLE3_SIZE MUL_FFT_TABLE3_SIZE
+#if defined (MUL_FFT_TABLE2_SIZE) && defined (SQR_FFT_TABLE2_SIZE)
+#if MUL_FFT_TABLE2_SIZE > SQR_FFT_TABLE2_SIZE
+#define FFT_TABLE2_SIZE MUL_FFT_TABLE2_SIZE
#else
-#define FFT_TABLE3_SIZE SQR_FFT_TABLE3_SIZE
-#endif
+#define FFT_TABLE2_SIZE SQR_FFT_TABLE2_SIZE
#endif
#endif
-#ifndef FFT_TABLE3_SIZE
-#define FFT_TABLE3_SIZE 200
+#ifndef FFT_TABLE2_SIZE
+#define FFT_TABLE2_SIZE 200
#endif
-FFT_TABLE_ATTRS struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE] =
+/* FIXME: The format of this should change to need less space.
+ Perhaps put n and k in the same 32-bit word, with n shifted-down
+ (k-2) steps, and k using the 4-5 lowest bits. That's possible since
+ n-1 is highly divisible.
+ Alternatively, separate n and k out into separate arrays. */
+struct nk {
+ unsigned int n:27;
+ unsigned int k:5;
+};
+
+static struct nk mpn_fft_table2[2][FFT_TABLE2_SIZE] =
{
- MUL_FFT_TABLE3,
- SQR_FFT_TABLE3
+ MUL_FFT_TABLE2,
+ SQR_FFT_TABLE2
};
int
mpn_fft_best_k (mp_size_t n, int sqr)
{
- FFT_TABLE_ATTRS struct fft_table_nk *fft_tab, *tab;
- mp_size_t tab_n, thres;
+ struct nk *tab;
int last_k;
- fft_tab = mpn_fft_table3[sqr];
- last_k = fft_tab->k;
- for (tab = fft_tab + 1; ; tab++)
+ last_k = 4;
+ for (tab = mpn_fft_table2[sqr] + 1; ; tab++)
{
- tab_n = tab->n;
- thres = tab_n << last_k;
- if (n <= thres)
+ if (n < tab->n)
break;
last_k = tab->k;
}
return last_k;
}
-
-#define MPN_FFT_BEST_READY 1
#endif
-/*****************************************************************************/
-
-#if ! defined (MPN_FFT_BEST_READY)
+#if !defined (MUL_FFT_TABLE2) || TUNE_PROGRAM_BUILD
FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] =
{
MUL_FFT_TABLE,
SQR_FFT_TABLE
};
+#endif
+#if !defined (MUL_FFT_TABLE2)
int
mpn_fft_best_k (mp_size_t n, int sqr)
{
@@ -160,9 +147,6 @@ mpn_fft_best_k (mp_size_t n, int sqr)
}
#endif
-/*****************************************************************************/
-
-
/* Returns smallest possible number of limbs >= pl for a fft of size 2^k,
i.e. smallest multiple of 2^k >= pl.
@@ -196,97 +180,137 @@ mpn_fft_initl (int **l, int k)
}
}
+/* Shift {up, n} left by cnt bits, store the complemented result
+ in {rp, n}, and return the bits shifted out (not complemented).
+ Same as:
+ cc = mpn_lshift (rp, up, n, cnt);
+ mpn_com_n (rp, rp, n);
+ return cc;
-/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
+ Assumes n >= 1, 1 <= cnt < GMP_NUMB_BITS, rp >= up.
+*/
+#ifndef HAVE_NATIVE_mpn_lshiftc
+#undef mpn_lshiftc
+static mp_limb_t
+mpn_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
+{
+ mp_limb_t high_limb, low_limb;
+ unsigned int tnc;
+ mp_size_t i;
+ mp_limb_t retval;
+
+ up += n;
+ rp += n;
+
+ tnc = GMP_NUMB_BITS - cnt;
+ low_limb = *--up;
+ retval = low_limb >> tnc;
+ high_limb = (low_limb << cnt);
+
+ for (i = n - 1; i != 0; i--)
+ {
+ low_limb = *--up;
+ *--rp = (~(high_limb | (low_limb >> tnc))) & GMP_NUMB_MASK;
+ high_limb = low_limb << cnt;
+ }
+ *--rp = (~high_limb) & GMP_NUMB_MASK;
+
+ return retval;
+}
+#endif
+
+/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
Assumes a is semi-normalized, i.e. a[n] <= 1.
r and a must have n+1 limbs, and not overlap.
*/
static void
-mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t d, mp_size_t n)
+mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, unsigned int d, mp_size_t n)
{
- unsigned int sh;
- mp_size_t m;
+ int sh, negate;
mp_limb_t cc, rd;
sh = d % GMP_NUMB_BITS;
- m = d / GMP_NUMB_BITS;
+ d /= GMP_NUMB_BITS;
+ negate = d >= n;
+ if (negate)
+ d -= n;
- if (m >= n) /* negate */
+ if (negate)
{
- /* r[0..m-1] <-- lshift(a[n-m]..a[n-1], sh)
- r[m..n-1] <-- -lshift(a[0]..a[n-m-1], sh) */
-
- m -= n;
+ /* r[0..d-1] <-- lshift(a[n-d]..a[n-1], sh)
+ r[d..n-1] <-- -lshift(a[0]..a[n-d-1], sh) */
if (sh != 0)
{
/* no out shift below since a[n] <= 1 */
- mpn_lshift (r, a + n - m, m + 1, sh);
- rd = r[m];
- cc = mpn_lshiftc (r + m, a, n - m, sh);
+ mpn_lshift (r, a + n - d, d + 1, sh);
+ rd = r[d];
+ cc = mpn_lshiftc (r + d, a, n - d, sh);
}
else
{
- MPN_COPY (r, a + n - m, m);
+ MPN_COPY (r, a + n - d, d);
rd = a[n];
- mpn_com (r + m, a, n - m);
+ mpn_com_n (r + d, a, n - d);
cc = 0;
}
- /* add cc to r[0], and add rd to r[m] */
+ /* add cc to r[0], and add rd to r[d] */
- /* now add 1 in r[m], subtract 1 in r[n], i.e. add 1 in r[0] */
+ /* now add 1 in r[d], subtract 1 in r[n], i.e. add 1 in r[0] */
r[n] = 0;
/* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */
cc++;
mpn_incr_u (r, cc);
- rd++;
+ rd ++;
/* rd might overflow when sh=GMP_NUMB_BITS-1 */
cc = (rd == 0) ? 1 : rd;
- r = r + m + (rd == 0);
+ r = r + d + (rd == 0);
mpn_incr_u (r, cc);
+
+ return;
+ }
+
+ /* if negate=0,
+ r[0..d-1] <-- -lshift(a[n-d]..a[n-1], sh)
+ r[d..n-1] <-- lshift(a[0]..a[n-d-1], sh)
+ */
+ if (sh != 0)
+ {
+ /* no out bits below since a[n] <= 1 */
+ mpn_lshiftc (r, a + n - d, d + 1, sh);
+ rd = ~r[d];
+ /* {r, d+1} = {a+n-d, d+1} << sh */
+ cc = mpn_lshift (r + d, a, n - d, sh); /* {r+d, n-d} = {a, n-d}<<sh */
}
else
{
- /* r[0..m-1] <-- -lshift(a[n-m]..a[n-1], sh)
- r[m..n-1] <-- lshift(a[0]..a[n-m-1], sh) */
- if (sh != 0)
- {
- /* no out bits below since a[n] <= 1 */
- mpn_lshiftc (r, a + n - m, m + 1, sh);
- rd = ~r[m];
- /* {r, m+1} = {a+n-m, m+1} << sh */
- cc = mpn_lshift (r + m, a, n - m, sh); /* {r+m, n-m} = {a, n-m}<<sh */
- }
- else
- {
- /* r[m] is not used below, but we save a test for m=0 */
- mpn_com (r, a + n - m, m + 1);
- rd = a[n];
- MPN_COPY (r + m, a, n - m);
- cc = 0;
- }
+ /* r[d] is not used below, but we save a test for d=0 */
+ mpn_com_n (r, a + n - d, d + 1);
+ rd = a[n];
+ MPN_COPY (r + d, a, n - d);
+ cc = 0;
+ }
- /* now complement {r, m}, subtract cc from r[0], subtract rd from r[m] */
+ /* now complement {r, d}, subtract cc from r[0], subtract rd from r[d] */
- /* if m=0 we just have r[0]=a[n] << sh */
- if (m != 0)
- {
- /* now add 1 in r[0], subtract 1 in r[m] */
- if (cc-- == 0) /* then add 1 to r[0] */
- cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
- cc = mpn_sub_1 (r, r, m, cc) + 1;
- /* add 1 to cc instead of rd since rd might overflow */
- }
+ /* if d=0 we just have r[0]=a[n] << sh */
+ if (d != 0)
+ {
+ /* now add 1 in r[0], subtract 1 in r[d] */
+ if (cc-- == 0) /* then add 1 to r[0] */
+ cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
+ cc = mpn_sub_1 (r, r, d, cc) + 1;
+ /* add 1 to cc instead of rd since rd might overflow */
+ }
- /* now subtract cc and rd from r[m..n] */
+ /* now subtract cc and rd from r[d..n] */
- r[n] = -mpn_sub_1 (r + m, r + m, n - m, cc);
- r[n] -= mpn_sub_1 (r + m, r + m, n - m, rd);
- if (r[n] & GMP_LIMB_HIGHBIT)
- r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
- }
+ r[n] = -mpn_sub_1 (r + d, r + d, n - d, cc);
+ r[n] -= mpn_sub_1 (r + d, r + d, n - d, rd);
+ if (r[n] & GMP_LIMB_HIGHBIT)
+ r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
}
@@ -294,7 +318,7 @@ mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t d, mp_size_t n)
Assumes a and b are semi-normalized.
*/
static inline void
-mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n)
+mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, int n)
{
mp_limb_t c, x;
@@ -325,7 +349,7 @@ mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n)
Assumes a and b are semi-normalized.
*/
static inline void
-mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n)
+mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, int n)
{
mp_limb_t c, x;
@@ -363,8 +387,8 @@ mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll,
if (K == 2)
{
mp_limb_t cy;
-#if HAVE_NATIVE_mpn_add_n_sub_n
- cy = mpn_add_n_sub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
+#if HAVE_NATIVE_mpn_addsub_n
+ cy = mpn_addsub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
#else
MPN_COPY (tp, Ap[0], n + 1);
mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
@@ -377,14 +401,14 @@ mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll,
}
else
{
- mp_size_t j, K2 = K >> 1;
+ int j;
int *lk = *ll;
- mpn_fft_fft (Ap, K2, ll-1, 2 * omega, n, inc * 2, tp);
- mpn_fft_fft (Ap+inc, K2, ll-1, 2 * omega, n, inc * 2, tp);
+ mpn_fft_fft (Ap, K >> 1, ll-1, 2 * omega, n, inc * 2, tp);
+ mpn_fft_fft (Ap+inc, K >> 1, ll-1, 2 * omega, n, inc * 2, tp);
/* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
- for (j = 0; j < K2; j++, lk += 2, Ap += 2 * inc)
+ for (j = 0; j < (K >> 1); j++, lk += 2, Ap += 2 * inc)
{
/* Ap[inc] <- Ap[0] + Ap[inc] * 2^(lk[1] * omega)
Ap[0] <- Ap[0] + Ap[inc] * 2^(lk[0] * omega) */
@@ -429,7 +453,7 @@ mpn_fft_normalize (mp_ptr ap, mp_size_t n)
/* a[i] <- a[i]*b[i] mod 2^(n*GMP_NUMB_BITS)+1 for 0 <= i < K */
static void
-mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K)
+mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, int K)
{
int i;
int sqr = (ap == bp);
@@ -439,13 +463,12 @@ mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K)
if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
{
- mp_size_t K2, nprime2, Nprime2, M2, maxLK, l, Mp2;
- int k;
- int **fft_l, *tmp;
+ int k, K2, nprime2, Nprime2, M2, maxLK, l, Mp2;
+ int **_fft_l;
mp_ptr *Ap, *Bp, A, B, T;
k = mpn_fft_best_k (n, sqr);
- K2 = (mp_size_t) 1 << k;
+ K2 = 1 << k;
ASSERT_ALWAYS((n & (K2 - 1)) == 0);
maxLK = (K2 > GMP_NUMB_BITS) ? K2 : GMP_NUMB_BITS;
M2 = n * GMP_NUMB_BITS >> k;
@@ -457,10 +480,10 @@ mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K)
/* we should ensure that nprime2 is a multiple of the next K */
if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
{
- mp_size_t K3;
+ unsigned long K3;
for (;;)
{
- K3 = (mp_size_t) 1 << mpn_fft_best_k (nprime2, sqr);
+ K3 = 1L << mpn_fft_best_k (nprime2, sqr);
if ((nprime2 & (K3 - 1)) == 0)
break;
nprime2 = (nprime2 + K3 - 1) & -K3;
@@ -472,53 +495,41 @@ mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K)
Mp2 = Nprime2 >> k;
- Ap = TMP_BALLOC_MP_PTRS (K2);
- Bp = TMP_BALLOC_MP_PTRS (K2);
- A = TMP_BALLOC_LIMBS (2 * (nprime2 + 1) << k);
- T = TMP_BALLOC_LIMBS (2 * (nprime2 + 1));
- B = A + ((nprime2 + 1) << k);
- fft_l = TMP_BALLOC_TYPE (k + 1, int *);
- tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int);
+ Ap = TMP_ALLOC_MP_PTRS (K2);
+ Bp = TMP_ALLOC_MP_PTRS (K2);
+ A = TMP_ALLOC_LIMBS (2 * K2 * (nprime2 + 1));
+ T = TMP_ALLOC_LIMBS (2 * (nprime2 + 1));
+ B = A + K2 * (nprime2 + 1);
+ _fft_l = TMP_ALLOC_TYPE (k + 1, int *);
for (i = 0; i <= k; i++)
- {
- fft_l[i] = tmp;
- tmp += (mp_size_t) 1 << i;
- }
+ _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl (_fft_l, k);
- mpn_fft_initl (fft_l, k);
-
- TRACE (printf ("recurse: %ldx%ld limbs -> %ld times %ldx%ld (%1.2f)\n", n,
+ TRACE (printf ("recurse: %ldx%ld limbs -> %d times %dx%d (%1.2f)\n", n,
n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
for (i = 0; i < K; i++, ap++, bp++)
{
- mp_limb_t cy;
mpn_fft_normalize (*ap, n);
if (!sqr)
mpn_fft_normalize (*bp, n);
-
- mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T);
- if (!sqr)
- mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T);
-
- cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2,
- l, Mp2, fft_l, T, sqr);
- (*ap)[n] = cy;
+ mpn_mul_fft_internal (*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
+ l, Mp2, _fft_l, T, 1);
}
}
else
{
mp_ptr a, b, tp, tpn;
mp_limb_t cc;
- mp_size_t n2 = 2 * n;
- tp = TMP_BALLOC_LIMBS (n2);
+ int n2 = 2 * n;
+ tp = TMP_ALLOC_LIMBS (n2);
tpn = tp + n;
- TRACE (printf (" mpn_mul_n %ld of %ld limbs\n", K, n));
+ TRACE (printf (" mpn_mul_n %d of %ld limbs\n", K, n));
for (i = 0; i < K; i++)
{
a = *ap++;
b = *bp++;
if (sqr)
- mpn_sqr (tp, a, n);
+ mpn_sqr_n (tp, a, n);
else
mpn_mul_n (tp, b, a, n);
if (a[n] != 0)
@@ -546,13 +557,13 @@ mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K)
This condition is also fulfilled at exit.
*/
static void
-mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp)
+mpn_fft_fftinv (mp_ptr *Ap, int K, mp_size_t omega, mp_size_t n, mp_ptr tp)
{
if (K == 2)
{
mp_limb_t cy;
-#if HAVE_NATIVE_mpn_add_n_sub_n
- cy = mpn_add_n_sub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1;
+#if HAVE_NATIVE_mpn_addsub_n
+ cy = mpn_addsub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1;
#else
MPN_COPY (tp, Ap[0], n + 1);
mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1);
@@ -565,7 +576,7 @@ mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp
}
else
{
- mp_size_t j, K2 = K >> 1;
+ int j, K2 = K >> 1;
mpn_fft_fftinv (Ap, K2, 2 * omega, n, tp);
mpn_fft_fftinv (Ap + K2, K2, 2 * omega, n, tp);
@@ -583,14 +594,15 @@ mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp
}
-/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
+/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 (r must differ from a) */
static void
-mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t k, mp_size_t n)
+mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, int k, mp_size_t n)
{
- mp_bitcnt_t i;
+ int i;
ASSERT (r != a);
- i = (mp_bitcnt_t) 2 * n * GMP_NUMB_BITS - k;
+ i = 2 * n * GMP_NUMB_BITS;
+ i = (i - k) % i; /* FIXME: This % looks superfluous */
mpn_fft_mul_2exp_modF (r, a, i, n);
/* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */
/* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */
@@ -602,11 +614,13 @@ mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t k, mp_size_t n)
Returns carry out, i.e. 1 iff {ap,an} = -1 mod 2^(n*GMP_NUMB_BITS)+1,
then {rp,n}=0.
*/
-static mp_size_t
+static int
mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an)
{
- mp_size_t l, m, rpn;
+ mp_size_t l;
+ long int m;
mp_limb_t cc;
+ int rpn;
ASSERT ((n <= an) && (an <= 3 * n));
m = an - 2 * n;
@@ -640,11 +654,10 @@ mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an)
We must have nl <= 2*K*l.
*/
static void
-mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime,
- mp_srcptr n, mp_size_t nl, mp_size_t l, mp_size_t Mp,
- mp_ptr T)
+mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, int K, int nprime, mp_srcptr n,
+ mp_size_t nl, int l, int Mp, mp_ptr T)
{
- mp_size_t i, j;
+ int i, j;
mp_ptr tmp;
mp_size_t Kl = K * l;
TMP_DECL;
@@ -655,7 +668,7 @@ mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime,
mp_size_t dif = nl - Kl;
mp_limb_signed_t cy;
- tmp = TMP_BALLOC_LIMBS(Kl + 1);
+ tmp = TMP_ALLOC_LIMBS(Kl + 1);
if (dif > Kl)
{
@@ -717,30 +730,48 @@ mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime,
}
/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS
- op is pl limbs, its high bit is returned.
+ n and m have respectively nl and ml limbs
+ op must have space for pl+1 limbs if rec=1 (and pl limbs if rec=0).
One must have pl = mpn_fft_next_size (pl, k).
T must have space for 2 * (nprime + 1) limbs.
+
+ If rec=0, then store only the pl low limbs of the result, and return
+ the out carry.
*/
static mp_limb_t
-mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k,
- mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B,
+mpn_mul_fft_internal (mp_ptr op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_ptr *Ap, mp_ptr *Bp,
+ mp_ptr A, mp_ptr B,
mp_size_t nprime, mp_size_t l, mp_size_t Mp,
- int **fft_l, mp_ptr T, int sqr)
+ int **_fft_l,
+ mp_ptr T, int rec)
{
- mp_size_t K, i, pla, lo, sh, j;
+ int i, sqr, pla, lo, sh, j;
mp_ptr p;
mp_limb_t cc;
- K = (mp_size_t) 1 << k;
+ sqr = n == m;
+
+ TRACE (printf ("pl=%ld k=%d K=%d np=%ld l=%ld Mp=%ld rec=%d sqr=%d\n",
+ pl,k,K,nprime,l,Mp,rec,sqr));
+
+ /* decomposition of inputs into arrays Ap[i] and Bp[i] */
+ if (rec)
+ {
+ mpn_mul_fft_decompose (A, Ap, K, nprime, n, K * l + 1, l, Mp, T);
+ if (!sqr)
+ mpn_mul_fft_decompose (B, Bp, K, nprime, m, K * l + 1, l, Mp, T);
+ }
/* direct fft's */
- mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T);
+ mpn_fft_fft (Ap, K, _fft_l + k, 2 * Mp, nprime, 1, T);
if (!sqr)
- mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T);
+ mpn_fft_fft (Bp, K, _fft_l + k, 2 * Mp, nprime, 1, T);
/* term to term multiplications */
- mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K);
+ mpn_fft_mul_modF_K (Ap, (sqr) ? Ap : Bp, nprime, K);
/* inverse fft's */
mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T);
@@ -804,14 +835,18 @@ mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k,
/* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
< K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
< K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
- return mpn_fft_norm_modF (op, pl, p, pla);
+ i = mpn_fft_norm_modF (op, pl, p, pla);
+ if (rec) /* store the carry out */
+ op[pl] = i;
+
+ return i;
}
/* return the lcm of a and 2^k */
-static mp_bitcnt_t
-mpn_mul_fft_lcm (mp_bitcnt_t a, int k)
+static unsigned long int
+mpn_mul_fft_lcm (unsigned long int a, unsigned int k)
{
- mp_bitcnt_t l = k;
+ unsigned long int l = k;
while (a % 2 == 0 && k > 0)
{
@@ -828,11 +863,10 @@ mpn_mul_fft (mp_ptr op, mp_size_t pl,
mp_srcptr m, mp_size_t ml,
int k)
{
- int i;
- mp_size_t K, maxLK;
+ int K, maxLK, i;
mp_size_t N, Nprime, nprime, M, Mp, l;
mp_ptr *Ap, *Bp, A, T, B;
- int **fft_l, *tmp;
+ int **_fft_l;
int sqr = (n == m && nl == ml);
mp_limb_t h;
TMP_DECL;
@@ -842,72 +876,63 @@ mpn_mul_fft (mp_ptr op, mp_size_t pl,
TMP_MARK;
N = pl * GMP_NUMB_BITS;
- fft_l = TMP_BALLOC_TYPE (k + 1, int *);
- tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int);
+ _fft_l = TMP_ALLOC_TYPE (k + 1, int *);
for (i = 0; i <= k; i++)
- {
- fft_l[i] = tmp;
- tmp += (mp_size_t) 1 << i;
- }
-
- mpn_fft_initl (fft_l, k);
- K = (mp_size_t) 1 << k;
+ _fft_l[i] = TMP_ALLOC_TYPE (1 << i, int);
+ mpn_fft_initl (_fft_l, k);
+ K = 1 << k;
M = N >> k; /* N = 2^k M */
l = 1 + (M - 1) / GMP_NUMB_BITS;
- maxLK = mpn_mul_fft_lcm (GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */
+ maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */
Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
/* Nprime = ceil((2*M+k+3)/maxLK)*maxLK; */
nprime = Nprime / GMP_NUMB_BITS;
- TRACE (printf ("N=%ld K=%ld, M=%ld, l=%ld, maxLK=%ld, Np=%ld, np=%ld\n",
+ TRACE (printf ("N=%ld K=%d, M=%ld, l=%ld, maxLK=%d, Np=%ld, np=%ld\n",
N, K, M, l, maxLK, Nprime, nprime));
/* we should ensure that recursively, nprime is a multiple of the next K */
if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
{
- mp_size_t K2;
+ unsigned long K2;
for (;;)
{
- K2 = (mp_size_t) 1 << mpn_fft_best_k (nprime, sqr);
+ K2 = 1L << mpn_fft_best_k (nprime, sqr);
if ((nprime & (K2 - 1)) == 0)
break;
nprime = (nprime + K2 - 1) & -K2;
Nprime = nprime * GMP_LIMB_BITS;
/* warning: since nprime changed, K2 may change too! */
}
- TRACE (printf ("new maxLK=%ld, Np=%ld, np=%ld\n", maxLK, Nprime, nprime));
+ TRACE (printf ("new maxLK=%d, Np=%ld, np=%ld\n", maxLK, Nprime, nprime));
}
ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */
- T = TMP_BALLOC_LIMBS (2 * (nprime + 1));
+ T = TMP_ALLOC_LIMBS (2 * (nprime + 1));
Mp = Nprime >> k;
- TRACE (printf ("%ldx%ld limbs -> %ld times %ldx%ld limbs (%1.2f)\n",
+ TRACE (printf ("%ldx%ld limbs -> %d times %ldx%ld limbs (%1.2f)\n",
pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K);
printf (" temp space %ld\n", 2 * K * (nprime + 1)));
- A = TMP_BALLOC_LIMBS (K * (nprime + 1));
- Ap = TMP_BALLOC_MP_PTRS (K);
+ A = __GMP_ALLOCATE_FUNC_LIMBS (2 * K * (nprime + 1));
+ B = A + K * (nprime + 1);
+ Ap = TMP_ALLOC_MP_PTRS (K);
+ Bp = TMP_ALLOC_MP_PTRS (K);
+
+ /* special decomposition for main call */
+ /* nl is the number of significant limbs in n */
mpn_mul_fft_decompose (A, Ap, K, nprime, n, nl, l, Mp, T);
- if (sqr)
- {
- mp_size_t pla;
- pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
- B = TMP_BALLOC_LIMBS (pla);
- Bp = TMP_BALLOC_MP_PTRS (K);
- }
- else
- {
- B = TMP_BALLOC_LIMBS (K * (nprime + 1));
- Bp = TMP_BALLOC_MP_PTRS (K);
- mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T);
- }
- h = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B, nprime, l, Mp, fft_l, T, sqr);
+ if (n != m)
+ mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T);
+
+ h = mpn_mul_fft_internal (op, n, m, pl, k, K, Ap, Bp, A, B, nprime, l, Mp, _fft_l, T, 0);
TMP_FREE;
+ __GMP_FREE_FUNC_LIMBS (A, 2 * K * (nprime + 1));
+
return h;
}
-#if WANT_OLD_FFT_FULL
/* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */
void
mpn_mul_fft_full (mp_ptr op,
@@ -916,9 +941,9 @@ mpn_mul_fft_full (mp_ptr op,
{
mp_ptr pad_op;
mp_size_t pl, pl2, pl3, l;
- mp_size_t cc, c2, oldcc;
int k2, k3;
int sqr = (n == m && nl == ml);
+ int cc, c2, oldcc;
pl = nl + ml; /* total number of limbs of the result */
@@ -935,7 +960,7 @@ mpn_mul_fft_full (mp_ptr op,
pl2 = (2 * pl - 1) / 5; /* ceil (2pl/5) - 1 */
do
{
- pl2++;
+ pl2 ++;
k2 = mpn_fft_best_k (pl2, sqr); /* best fft size for pl2 limbs */
pl2 = mpn_fft_next_size (pl2, k2);
pl3 = 3 * pl2 / 2; /* since k>=FFT_FIRST_K=4, pl2 is a multiple of 2^4,
@@ -949,23 +974,23 @@ mpn_mul_fft_full (mp_ptr op,
ASSERT_ALWAYS(pl3 <= pl);
cc = mpn_mul_fft (op, pl3, n, nl, m, ml, k3); /* mu */
- ASSERT(cc == 0);
+ ASSERT_ALWAYS(cc == 0);
pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl2);
cc = mpn_mul_fft (pad_op, pl2, n, nl, m, ml, k2); /* lambda */
cc = -cc + mpn_sub_n (pad_op, pad_op, op, pl2); /* lambda - low(mu) */
/* 0 <= cc <= 1 */
- ASSERT(0 <= cc && cc <= 1);
+ ASSERT_ALWAYS(0 <= cc && cc <= 1);
l = pl3 - pl2; /* l = pl2 / 2 since pl3 = 3/2 * pl2 */
c2 = mpn_add_n (pad_op, pad_op, op + pl2, l);
cc = mpn_add_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2) - cc;
- ASSERT(-1 <= cc && cc <= 1);
+ ASSERT_ALWAYS(-1 <= cc && cc <= 1);
if (cc < 0)
cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc);
- ASSERT(0 <= cc && cc <= 1);
+ ASSERT_ALWAYS(0 <= cc && cc <= 1);
/* now lambda-mu = {pad_op, pl2} - cc mod 2^(pl2*GMP_NUMB_BITS)+1 */
oldcc = cc;
-#if HAVE_NATIVE_mpn_add_n_sub_n
- c2 = mpn_add_n_sub_n (pad_op + l, pad_op, pad_op, pad_op + l, l);
+#if HAVE_NATIVE_mpn_addsub_n
+ c2 = mpn_addsub_n (pad_op + l, pad_op, pad_op, pad_op + l, l);
/* c2 & 1 is the borrow, c2 & 2 is the carry */
cc += c2 >> 1; /* carry out from high <- low + high */
c2 = c2 & 1; /* borrow out from low <- low - high */
@@ -975,7 +1000,7 @@ mpn_mul_fft_full (mp_ptr op,
TMP_DECL;
TMP_MARK;
- tmp = TMP_BALLOC_LIMBS (l);
+ tmp = TMP_ALLOC_LIMBS (l);
MPN_COPY (tmp, pad_op, l);
c2 = mpn_sub_n (pad_op, pad_op, pad_op + l, l);
cc += mpn_add_n (pad_op + l, tmp, pad_op + l, l);
@@ -1011,4 +1036,3 @@ mpn_mul_fft_full (mp_ptr op,
/* since the final result has at most pl limbs, no carry out below */
mpn_add_1 (op + pl2, op + pl2, pl - pl2, (mp_limb_t) c2);
}
-#endif
diff --git a/gmp/mpn/generic/mul_n.c b/gmp/mpn/generic/mul_n.c
index 5df8b16fa0..4aa25f9b58 100644
--- a/gmp/mpn/generic/mul_n.c
+++ b/gmp/mpn/generic/mul_n.c
@@ -1,38 +1,695 @@
-/* mpn_mul_n -- multiply natural numbers.
+/* mpn_mul_n and helper function -- Multiply/square natural numbers.
-Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software
-Foundation, Inc.
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) ARE
+ INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH
+ DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE
+ OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
+
+/* Multiplies using 3 half-sized mults and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
+void
+mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+{
+ mp_limb_t w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+ mp_size_t i;
+ int sign;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if ((n & 1) != 0)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ sign = 0;
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3 + i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign = ~0;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ w = b[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p + n3, b, b + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n3 + i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n3;
+ y = b;
+ sign = ~sign;
+ }
+ else
+ {
+ x = b;
+ y = b + n3;
+ }
+ mpn_sub_n (p + n3, x, y, n2);
+ }
+ p[n] = w;
+
+ n1 = n + 1;
+ if (n2 < MUL_KARATSUBA_THRESHOLD)
+ {
+ if (n3 < MUL_KARATSUBA_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n3, p + n3, n3);
+ mpn_mul_basecase (p, a, n3, b, n3);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ }
+ mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
+ }
+
+ if (sign)
+ mpn_add_n (ws, p, ws, n1);
+ else
+ mpn_sub_n (ws, p, ws, n1);
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1))
+ {
+ mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
+ ws[nm1] = x;
+ if (x == 0)
+ ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mpn_incr_u (p + n1 + n3, 1);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2 + i];
+ }
+ while (w0 == w1 && i != 0);
+ sign = 0;
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign = ~0;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n2 + i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n2;
+ y = b;
+ sign = ~sign;
+ }
+ else
+ {
+ x = b;
+ y = b + n2;
+ }
+ mpn_sub_n (p + n2, x, y, n2);
+
+ /* Pointwise products. */
+ if (n2 < MUL_KARATSUBA_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n2, p + n2, n2);
+ mpn_mul_basecase (p, a, n2, b, n2);
+ mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
+ mpn_kara_mul_n (p, a, b, n2, ws + n);
+ mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ if (sign)
+ w = mpn_add_n (ws, p, ws, n);
+ else
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
+ }
+}
+
+void
+mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+{
+ mp_limb_t w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+ mp_size_t i;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if ((n & 1) != 0)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3 + i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ n1 = n + 1;
+
+ /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
+ could be combined. But that's not important, since the tests will
+         take a minuscule amount of time compared to the function calls. */
+ if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
+ {
+ mpn_mul_basecase (ws, p, n3, p, n3);
+ mpn_mul_basecase (p, a, n3, a, n3);
+ }
+ else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
+ {
+ mpn_sqr_basecase (ws, p, n3);
+ mpn_sqr_basecase (p, a, n3);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n3, ws + n1); /* (x-y)^2 */
+ mpn_kara_sqr_n (p, a, n3, ws + n1); /* x^2 */
+ }
+ if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
+ mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
+ else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
+ mpn_sqr_basecase (p + n1, a + n3, n2);
+ else
+ mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); /* y^2 */
+
+ /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
+ borrow from mpn_sub_n. If it occurs then it'll be cancelled by a
+ carry from ws[n]. Further, since 2xy fits in n1 limbs there won't
+ be any carry out of ws[n] other than cancelling that borrow. */
+
+ mpn_sub_n (ws, p, ws, n1); /* x^2-(x-y)^2 */
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1)) /* x^2+y^2-(x-y)^2 = 2xy */
+ {
+ mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
+ ws[nm1] = x;
+ if (x == 0)
+ ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mpn_incr_u (p + n1 + n3, 1);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2 + i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ /* Pointwise products. */
+ if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
+ {
+ mpn_mul_basecase (ws, p, n2, p, n2);
+ mpn_mul_basecase (p, a, n2, a, n2);
+ mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
+ }
+ else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
+ {
+ mpn_sqr_basecase (ws, p, n2);
+ mpn_sqr_basecase (p, a, n2);
+ mpn_sqr_basecase (p + n, a + n2, n2);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n2, ws + n);
+ mpn_kara_sqr_n (p, a, n2, ws + n);
+ mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
+ }
+}
+
+/******************************************************************************
+ * *
+ * Toom 3-way multiplication and squaring *
+ * *
+ *****************************************************************************/
+
+/* Starts from:
+ {v0,2k} (stored in {c,2k})
+   {vm1,2k+1} (whose sign is sa, and whose absolute value is stored in {vm1,2k+1})
+ {v1,2k+1} (stored in {c+2k,2k+1})
+ {v2,2k+1}
+ {vinf,twor} (stored in {c+4k,twor}, except the first limb, saved in vinf0)
+
+ ws is temporary space, and should have at least twor limbs.
+
+   put in {c, 2n} where n = 2k+r (so 2n = 4k+twor) the value of {v0,2k} (already in place)
+ + B^k * {tm1, 2k+1}
+ + B^(2k) * {t1, 2k+1}
+ + B^(3k) * {t2, 2k+1}
+ + B^(4k) * {vinf,twor} (high twor-1 limbs already in place)
+   where {t1, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1} - 2*{v0,2k})/2 - {vinf,twor}
+         {t2, 2k+1} = (3*({v0,2k}-{v1,2k+1})-sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,twor}
+         {tm1,2k+1} = ({v1,2k+1}-sa*{vm1,2k+1})/2-{t2,2k+1}
+
+ Exact sequence described in a comment in mpn_toom3_mul_n.
+ mpn_toom3_mul_n() and mpn_toom3_sqr_n() implement steps 1-2.
+ mpn_toom_interpolate_5pts() implements steps 3-4.
+
+ Reference: What About Toom-Cook Matrices Optimality? Marco Bodrato
+ and Alberto Zanoni, October 19, 2006, http://bodrato.it/papers/#CIVV2006
+
+ ************* saved note ****************
+ Think about:
+
+ The evaluated point a-b+c stands a good chance of having a zero carry
+ limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8 chance, roughly.
+ Perhaps this could be tested and stripped. Doing so before recursing
+ would be better than stripping at the start of mpn_toom3_mul_n/sqr_n,
+ since then the recursion could be based on the new size. Although in
+ truth the kara vs toom3 crossover is never so exact that one limb either
+ way makes a difference.
+
+ A small value like 1 or 2 for the carry could perhaps also be handled
+ with an add_n or addlsh1_n. Would that be faster than an extra limb on a
+ (recursed) multiply/square?
+*/
+
+#define TOOM3_MUL_REC(p, a, b, n, ws) \
+ do { \
+ if (MUL_TOOM3_THRESHOLD / 3 < MUL_KARATSUBA_THRESHOLD \
+ && BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD)) \
+ mpn_kara_mul_n (p, a, b, n, ws); \
+ else \
+ mpn_toom3_mul_n (p, a, b, n, ws); \
+ } while (0)
+
+#define TOOM3_SQR_REC(p, a, n, ws) \
+ do { \
+ if (SQR_TOOM3_THRESHOLD / 3 < SQR_BASECASE_THRESHOLD \
+ && BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \
+ mpn_mul_basecase (p, a, n, a, n); \
+ else if (SQR_TOOM3_THRESHOLD / 3 < SQR_KARATSUBA_THRESHOLD \
+ && BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \
+ mpn_sqr_basecase (p, a, n); \
+ else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
+ mpn_kara_sqr_n (p, a, n, ws); \
+ else \
+ mpn_toom3_sqr_n (p, a, n, ws); \
+ } while (0)
+
+/* The necessary temporary space T(n) satisfies T(n)=0 for n < THRESHOLD,
+ and T(n) <= max(2n+2, 6k+3, 4k+3+T(k+1)) otherwise, where k = ceil(n/3).
+
+ Assuming T(n) >= 2n, 6k+3 <= 4k+3+T(k+1).
+ Similarly, 2n+2 <= 6k+2 <= 4k+3+T(k+1).
+
+ With T(n) = 2n+S(n), this simplifies to S(n) <= 9 + S(k+1).
+ Since THRESHOLD >= 17, we have n/(k+1) >= 19/8
+ thus S(n) <= S(n/(19/8)) + 9 thus S(n) <= 9*log(n)/log(19/8) <= 8*log2(n).
+*/
+
+void
+mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
+{
+ mp_size_t k, k1, kk1, r, twok, twor;
+ mp_limb_t cy, cc, saved, vinf0;
+ mp_ptr trec;
+ int sa, sb;
+ mp_ptr c1, c2, c3, c4, c5;
+
+ ASSERT(GMP_NUMB_BITS >= 6);
+ ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */
+
+ /*
+ The algorithm is the following:
+
+ 0. k = ceil(n/3), r = n - 2k, B = 2^(GMP_NUMB_BITS), t = B^k
+ 1. split a and b in three parts each a0, a1, a2 and b0, b1, b2
+ with a0, a1, b0, b1 of k limbs, and a2, b2 of r limbs
+ 2. Evaluation: vm1 may be negative, the other can not.
+ v0 <- a0*b0
+ v1 <- (a0+a1+a2)*(b0+b1+b2)
+ v2 <- (a0+2*a1+4*a2)*(b0+2*b1+4*b2)
+ vm1 <- (a0-a1+a2)*(b0-b1+b2)
+ vinf <- a2*b2
+ 3. Interpolation: every result is positive, all divisions are exact
+ t2 <- (v2 - vm1)/3
+ tm1 <- (v1 - vm1)/2
+ t1 <- (v1 - v0)
+ t2 <- (t2 - t1)/2
+ t1 <- (t1 - tm1 - vinf)
+ t2 <- (t2 - 2*vinf)
+ tm1 <- (tm1 - t2)
+ 4. result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where
+ c0 <- v0
+ c1 <- tm1
+ c2 <- t1
+ c3 <- t2
+ c4 <- vinf
+ */
+
+ k = (n + 2) / 3; /* ceil(n/3) */
+ twok = 2 * k;
+ k1 = k + 1;
+ kk1 = k + k1;
+ r = n - twok; /* last chunk */
+ twor = 2 * r;
+
+ c1 = c + k;
+ c2 = c1 + k;
+ c3 = c2 + k;
+ c4 = c3 + k;
+ c5 = c4 + k;
+
+ trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */
+
+ /* put a0+a2 in {c, k+1}, and b0+b2 in {c+4k+2, k+1};
+ put a0+a1+a2 in {t, k+1} and b0+b1+b2 in {t+k+1,k+1}
+     [requires 5k+3 <= 2n, i.e. n >= 17] */
+ cy = mpn_add_n (c, a, a + twok, r);
+ cc = mpn_add_n (c4 + 2, b, b + twok, r);
+ if (r < k)
+ {
+ __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
+ __GMPN_ADD_1 (cc, c4 + 2 + r, b + r, k - r, cc);
+ }
+
+ /* Put in {t, k+1} the sum
+ * (a_0+a_2) - stored in {c, k+1} -
+ * +
+ * a_1 - stored in {a+k, k} */
+ t[k] = (c1[0] = cy) + mpn_add_n (t, c, a + k, k);
+ /* ^ ^
+ * carry of a_0 + a_2 carry of (a_0+a_2) + a_1
+
+ */
+
+ /* Put in {t+k+1, k+1} the sum of the two values
+   * (b_0+b_2) - stored in {c4+2, k+1} -
+ * +
+ * b_1 - stored in {b+k, k} */
+ t[kk1] = (c5[3] = cc) + mpn_add_n (t + k1, c4 + 2, b + k, k);
+ /* ^ ^
+ * carry of b_0 + b_2 carry of (b_0+b_2) + b_1 */
+
+#define v2 (t+2*k+1)
+
+  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {c+2k, 2k+1};
+ since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
+ TOOM3_MUL_REC (c2, t, t + k1, k1, trec);
+
+ /* c c2 c4 t
+ {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v1 */
+
+ /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c+4k+2,k+1} */
+ /* (They're already there, actually) */
+
+ /* sa = sign(a0-a1+a2) */
+ sa = (cy != 0) ? 1 : mpn_cmp (c, a + k, k);
+ c[k] = (sa >= 0) ? cy - mpn_sub_n (c, c, a + k, k)
+ : mpn_sub_n (c, a + k, c, k);
+
+ sb = (cc != 0) ? 1 : mpn_cmp (c4 + 2, b + k, k);
+ c5[2] = (sb >= 0) ? cc - mpn_sub_n (c4 + 2, c4 + 2, b + k, k)
+ : mpn_sub_n (c4 + 2, b + k, c4 + 2, k);
+ sa *= sb; /* sign of vm1 */
+
+ /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
+ since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
+ TOOM3_MUL_REC (t, c, c4 + 2, k1, trec);
+
+ /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v1 vm1
+ */
+
+ /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c+4k+2, k+1}
+ [requires 5k+3 <= 2n, i.e. n >= 17] */
+#ifdef HAVE_NATIVE_mpn_addlsh1_n
+ c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
+ c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
+ if (r < k)
+ {
+ __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
+ __GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]);
+ }
+ c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
+ c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
+#else
+ c[r] = mpn_lshift (c, a + twok, r, 1);
+ c4[r + 2] = mpn_lshift (c4 + 2, b + twok, r, 1);
+ if (r < k)
+ {
+ MPN_ZERO(c + r + 1, k - r);
+ MPN_ZERO(c4 + r + 3, k - r);
+ }
+ c1[0] += mpn_add_n (c, c, a + k, k);
+ c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
+ mpn_lshift (c, c, k1, 1);
+ mpn_lshift (c4 + 2, c4 + 2, k1, 1);
+ c1[0] += mpn_add_n (c, c, a, k);
+ c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
+#endif
+
+ /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
+     v2 < 49*B^(2k) so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
+ TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);
+
+ /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v1 vm1 v2
+ */
+
+ /* compute v0 := a0*b0 in {c, 2k} */
+ TOOM3_MUL_REC (c, a, b, k, trec);
+
+ /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v0 v1 vm1 v2 */
+
+  /* compute vinf := a2*b2 in {c4, 2r}, i.e. {c+4k, 2r} */
+
+  saved = c4[0]; /* Remember v1's highest limb (will be overwritten). */
+ TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec); /* Overwrites c4[0]. */
+  vinf0 = c4[0]; /* Remember vinf's lowest limb (will be overwritten).*/
+ c4[0] = saved; /* Overwriting. Now v1 value is correct. */
+
+ /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v0 v1 vinf[1..] vm1 v2 */
+
+ mpn_toom_interpolate_5pts (c, v2, t, k, 2*r, sa, vinf0, trec);
+
+#undef v2
+}
+
+void
+mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
+{
+ mp_size_t k, k1, kk1, r, twok, twor;
+ mp_limb_t cy, saved, vinf0;
+ mp_ptr trec;
+ int sa;
+ mp_ptr c1, c2, c3, c4;
+
+ ASSERT(GMP_NUMB_BITS >= 6);
+ ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */
+
+ /* the algorithm is the same as mpn_toom3_mul_n, with b=a */
+
+ k = (n + 2) / 3; /* ceil(n/3) */
+ twok = 2 * k;
+ k1 = k + 1;
+ kk1 = k + k1;
+ r = n - twok; /* last chunk */
+ twor = 2 * r;
+
+ c1 = c + k;
+ c2 = c1 + k;
+ c3 = c2 + k;
+ c4 = c3 + k;
+
+ trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */
+
+ cy = mpn_add_n (c, a, a + twok, r);
+ if (r < k)
+ __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
+ t[k] = (c1[0] = cy) + mpn_add_n (t, c, a + k, k);
+
+#define v2 (t+2*k+1)
+
+ TOOM3_SQR_REC (c2, t, k1, trec);
+
+ sa = (cy != 0) ? 1 : mpn_cmp (c, a + k, k);
+ c[k] = (sa >= 0) ? cy - mpn_sub_n (c, c, a + k, k)
+ : mpn_sub_n (c, a + k, c, k);
+
+ TOOM3_SQR_REC (t, c, k1, trec);
+
+#ifdef HAVE_NATIVE_mpn_addlsh1_n
+ c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
+ if (r < k)
+ __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
+ c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
+#else
+ c[r] = mpn_lshift (c, a + twok, r, 1);
+ if (r < k)
+ MPN_ZERO(c + r + 1, k - r);
+ c1[0] += mpn_add_n (c, c, a + k, k);
+ mpn_lshift (c, c, k1, 1);
+ c1[0] += mpn_add_n (c, c, a, k);
+#endif
+
+ TOOM3_SQR_REC (v2, c, k1, trec);
+
+ TOOM3_SQR_REC (c, a, k, trec);
+
+ saved = c4[0];
+ TOOM3_SQR_REC (c4, a + twok, r, trec);
+ vinf0 = c4[0];
+ c4[0] = saved;
+
+ mpn_toom_interpolate_5pts (c, v2, t, k, 2*r, 1, vinf0, trec);
+
+#undef v2
+}
+
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
@@ -40,28 +697,31 @@ mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));
- if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+ if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
{
mpn_mul_basecase (p, a, n, b, n);
}
- else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
+ else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
{
/* Allocate workspace of fixed size on stack: fast! */
- mp_limb_t ws[mpn_toom22_mul_itch (MUL_TOOM33_THRESHOLD_LIMIT-1,
- MUL_TOOM33_THRESHOLD_LIMIT-1)];
- ASSERT (MUL_TOOM33_THRESHOLD <= MUL_TOOM33_THRESHOLD_LIMIT);
- mpn_toom22_mul (p, a, n, b, n, ws);
+ mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
+ ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
+ mpn_kara_mul_n (p, a, b, n, ws);
}
else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))
{
mp_ptr ws;
TMP_SDECL;
TMP_SMARK;
- ws = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (n, n));
- mpn_toom33_mul (p, a, n, b, n, ws);
+ ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
+ mpn_toom3_mul_n (p, a, b, n, ws);
TMP_SFREE;
}
- else if (BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD))
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD))
+#else
+ else if (BELOW_THRESHOLD (n, MPN_TOOM44_MAX_N))
+#endif
{
mp_ptr ws;
TMP_SDECL;
@@ -70,28 +730,91 @@ mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
mpn_toom44_mul (p, a, n, b, n, ws);
TMP_SFREE;
}
- else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD))
+ else
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ {
+ /* The current FFT code allocates its own space. That should probably
+ change. */
+ mpn_mul_fft_full (p, a, n, b, n);
+ }
+#else
+ {
+ /* Toom4 for large operands. */
+ mp_ptr ws;
+ TMP_DECL;
+ TMP_MARK;
+ ws = TMP_BALLOC_LIMBS (mpn_toom44_mul_itch (n, n));
+ mpn_toom44_mul (p, a, n, b, n, ws);
+ TMP_FREE;
+ }
+#endif
+}
+
+void
+mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
+{
+ ASSERT (n >= 1);
+ ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
+
+#if 0
+ /* FIXME: Can this be removed? */
+ if (n == 0)
+ return;
+#endif
+
+ if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+ { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
+ mpn_mul_basecase (p, a, n, a, n);
+ }
+ else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
+ {
+ mpn_sqr_basecase (p, a, n);
+ }
+ else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
+ {
+ /* Allocate workspace of fixed size on stack: fast! */
+ mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)];
+ ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
+ mpn_kara_sqr_n (p, a, n, ws);
+ }
+ else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
{
mp_ptr ws;
TMP_SDECL;
TMP_SMARK;
- ws = TMP_SALLOC_LIMBS (mpn_toom6_mul_n_itch (n));
- mpn_toom6h_mul (p, a, n, b, n, ws);
+ ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
+ mpn_toom3_sqr_n (p, a, n, ws);
TMP_SFREE;
}
- else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD))
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
+#else
+ else if (BELOW_THRESHOLD (n, MPN_TOOM44_MAX_N))
+#endif
{
mp_ptr ws;
- TMP_DECL;
- TMP_MARK;
- ws = TMP_ALLOC_LIMBS (mpn_toom8_mul_n_itch (n));
- mpn_toom8h_mul (p, a, n, b, n, ws);
- TMP_FREE;
+ TMP_SDECL;
+ TMP_SMARK;
+ ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
+ mpn_toom4_sqr (p, a, n, ws);
+ TMP_SFREE;
}
else
+#if WANT_FFT || TUNE_PROGRAM_BUILD
{
/* The current FFT code allocates its own space. That should probably
change. */
- mpn_fft_mul (p, a, n, b, n);
+ mpn_mul_fft_full (p, a, n, a, n);
+ }
+#else
+ {
+ /* Toom4 for large operands. */
+ mp_ptr ws;
+ TMP_DECL;
+ TMP_MARK;
+ ws = TMP_BALLOC_LIMBS (mpn_toom4_sqr_itch (n));
+ mpn_toom4_sqr (p, a, n, ws);
+ TMP_FREE;
}
+#endif
}
diff --git a/gmp/mpn/generic/mullo_basecase.c b/gmp/mpn/generic/mullo_basecase.c
deleted file mode 100644
index 2120f44c3d..0000000000
--- a/gmp/mpn/generic/mullo_basecase.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* mpn_mullo_basecase -- Internal routine to multiply two natural
- numbers of length m and n and return the low part.
-
- THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-
-
-Copyright (C) 2000, 2002, 2004 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- FIXME: Should use mpn_addmul_2 (and higher).
-*/
-
-void
-mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
-{
- mp_size_t i;
-
- mpn_mul_1 (rp, up, n, vp[0]);
-
- for (i = 1; i < n; i++)
- mpn_addmul_1 (rp + i, up, n - i, vp[i]);
-}
diff --git a/gmp/mpn/generic/mullo_n.c b/gmp/mpn/generic/mullo_n.c
deleted file mode 100644
index dad75ee8f7..0000000000
--- a/gmp/mpn/generic/mullo_n.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
- of their products.
-
- Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
- THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
- FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
- THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-#ifndef MULLO_BASECASE_THRESHOLD
-#define MULLO_BASECASE_THRESHOLD 0 /* never use mpn_mul_basecase */
-#endif
-
-#ifndef MULLO_DC_THRESHOLD
-#define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD
-#endif
-
-#ifndef MULLO_MUL_N_THRESHOLD
-#define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD
-#endif
-
-#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
-#define MAYBE_range_basecase 1
-#define MAYBE_range_toom22 1
-#else
-#define MAYBE_range_basecase \
- ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
-#define MAYBE_range_toom22 \
- ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
-#endif
-
-/* THINK: The DC strategy uses different constants in different Toom's
- ranges. Something smoother?
-*/
-
-/*
- Compute the least significant half of the product {xy,n}*{yp,n}, or
- formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
-
- Above the given threshold, the Divide and Conquer strategy is used.
- The operands are split in two, and a full product plus two mullo
- are used to obtain the final result. The more natural strategy is to
- split in two halves, but this is far from optimal when a
- sub-quadratic multiplication is used.
-
- Mulders suggests an unbalanced split in favour of the full product,
- split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
-
- To compute the value of a, we assume that the cost of mullo for a
- given size ML(n) is a fraction of the cost of a full product with
- same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
- then we can write:
-
- ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
-
- Given a value for e, want to minimise the value of k, i.e. the
- function k=(1-a)^e/(1-2*a^e).
-
- With e=2, the exponent for schoolbook multiplication, the minimum is
- given by the values a=1-a=1/2.
-
- With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
- Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
-
- Other possible approximations follow:
- e=log(5)/log(3) [Toom-3] -> a ~= 9/40
- e=log(7)/log(4) [Toom-4] -> a ~= 7/39
- e=log(11)/log(6) [Toom-6] -> a ~= 1/8
- e=log(15)/log(8) [Toom-8] -> a ~= 1/10
-
- The values above where obtained with the following trivial commands
- in the gp-pari shell:
-
-fun(e,a)=(1-a)^e/(1-2*a^e)
-mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
-contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
-contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
-contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
-contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
-contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
-
- ,
- |\
- | \
- +----,
- | |
- | |
- | |\
- | | \
- +----+--`
- ^ n2 ^n1^
-
- For an actual implementation, the assumption that M(n)=n^e is
- incorrect, as a consequence also the assumption that ML(n)=k*M(n)
- with a constant k is wrong.
-
- But theory suggest us two things:
- - the best the multiplication product is (lower e), the more k
- approaches 1, and a approaches 0.
-
- - A value for a smaller than optimal is probably less bad than a
- bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
- value, and k(a)=0.808_ the mul/mullo speed ratio. We get
- k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
-*/
-
-static mp_size_t
-mpn_mullo_n_itch (mp_size_t n)
-{
- return 2*n;
-}
-
-/*
- mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
- It accepts tp == rp.
-*/
-static void
-mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
-{
- mp_size_t n2, n1;
- ASSERT (n >= 2);
- ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
- ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
-
- /* Divide-and-conquer */
-
- /* We need fractional approximation of the value 0 < a <= 1/2
- giving the minimum in the function k=(1-a)^e/(1-2*a^e).
- */
- if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
- n1 = n >> 1;
- else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
- n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */
- else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
- n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */
- else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
- n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
- /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */
- else
- n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */
-
- n2 = n - n1;
-
- /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
- y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
-
- /* x0 * y0 */
- mpn_mul_n (tp, xp, yp, n2);
- MPN_COPY (rp, tp, n2);
-
- /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
- if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
- mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
- else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
- mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
- else
- mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
- mpn_add_n (rp + n2, tp + n2, tp + n, n1);
-
- /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
- if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
- mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
- else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
- mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
- else
- mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
- mpn_add_n (rp + n2, rp + n2, tp + n, n1);
-}
-
-/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
-#define MUL_BASECASE_ALLOC \
- (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
-
-/* FIXME: This function should accept a temporary area; dc_mullow_n
- accepts a pointer tp, and handle the case tp == rp, do the same here.
- Maybe recombine the two functions.
- THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
- (typically thanks to mpn_addmul_2) should we unconditionally use
- mpn_mul_n?
-*/
-
-void
-mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
-{
- ASSERT (n >= 1);
- ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
-
- if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
- {
- /* Allocate workspace of fixed size on stack: fast! */
- mp_limb_t tp[MUL_BASECASE_ALLOC];
- mpn_mul_basecase (tp, xp, n, yp, n);
- MPN_COPY (rp, tp, n);
- }
- else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
- {
- mpn_mullo_basecase (rp, xp, yp, n);
- }
- else
- {
- mp_ptr tp;
- TMP_DECL;
- TMP_MARK;
- tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
- if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
- {
- mpn_dc_mullo_n (rp, xp, yp, n, tp);
- }
- else
- {
- /* For really large operands, use plain mpn_mul_n but throw away upper n
- limbs of result. */
-#if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
- mpn_fft_mul (tp, xp, n, yp, n);
-#else
- mpn_mul_n (tp, xp, yp, n);
-#endif
- MPN_COPY (rp, tp, n);
- }
- TMP_FREE;
- }
-}
diff --git a/gmp/mpn/generic/mullow_basecase.c b/gmp/mpn/generic/mullow_basecase.c
new file mode 100644
index 0000000000..72c48f65b4
--- /dev/null
+++ b/gmp/mpn/generic/mullow_basecase.c
@@ -0,0 +1,41 @@
+/* mpn_mullow_basecase -- Internal routine to multiply two natural
+ numbers of length n each and return the low n limbs of the product.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 2000, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/*
+ FIXME: Should use mpn_addmul_2 (and higher).
+*/
+
+void
+mpn_mullow_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{ /* Schoolbook low product: {rp,n} receives the low n limbs of {up,n} * {vp,n}. */
+ mp_size_t i; /* index of the vp limb (row) being accumulated */
+ /* Row 0 initialises all of {rp,n}; vp[0] is read unconditionally, so n must be >= 1. */
+ mpn_mul_1 (rp, up, n, vp[0]); /* return value (limb above rp[n-1]) is discarded */
+ /* Row i starts at rp[i], so only the low n-i limbs of up can reach below limb n. */
+ for (i = 1; i < n; i++)
+ mpn_addmul_1 (rp + i, up, n - i, vp[i]); /* return value discarded likewise */
+}
diff --git a/gmp/mpn/generic/mullow_n.c b/gmp/mpn/generic/mullow_n.c
new file mode 100644
index 0000000000..e92a554616
--- /dev/null
+++ b/gmp/mpn/generic/mullow_n.c
@@ -0,0 +1,111 @@
+/* mpn_mullow_n -- multiply two n-limb numbers and return the low n limbs
+ of their products.
+
+ THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
+ FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2004, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+#ifndef MULLOW_BASECASE_THRESHOLD
+#define MULLOW_BASECASE_THRESHOLD 0 /* never use mpn_mul_basecase */
+#endif
+
+#ifndef MULLOW_DC_THRESHOLD
+#define MULLOW_DC_THRESHOLD 3*MUL_KARATSUBA_THRESHOLD
+#endif
+
+#ifndef MULLOW_MUL_N_THRESHOLD
+#define MULLOW_MUL_N_THRESHOLD 10*MULLOW_DC_THRESHOLD
+#endif
+
+/* Avoid zero allocations when MULLOW_BASECASE_THRESHOLD is 0. */
+#define MUL_BASECASE_ALLOC \
+ (MULLOW_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLOW_BASECASE_THRESHOLD_LIMIT)
+
+/*
+ FIXME: This function should accept a temporary area.
+ FIXME: Perhaps call mpn_kara_mul_n instead of mpn_mul_n?
+ THINK: If mpn_mul_basecase is always faster than mpn_mullow_basecase
+ (typically thanks to mpn_addmul_2) should we unconditionally use
+ mpn_mul_n?
+ FIXME: The recursive calls to mpn_mullow_n use sizes n/2 (one uses floor(n/2)
+ and the other ceil(n/2)). Depending on the values of the various
+ _THRESHOLDs, this may never trigger MULLOW_BASECASE_THRESHOLD.
+ Should we worry about this overhead?
+*/
+
+void
+mpn_mullow_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{ /* {rp,n} = low n limbs of {xp,n} * {yp,n}; strategy picked by the size thresholds below. */
+ if (BELOW_THRESHOLD (n, MULLOW_BASECASE_THRESHOLD))
+ {
+ /* Fixed-size stack workspace (fast): form the full product there, keep the low n limbs. */
+ mp_limb_t ws[MUL_BASECASE_ALLOC];
+ mpn_mul_basecase (ws, xp, n, yp, n);
+ MPN_COPY (rp, ws, n);
+ }
+ else if (BELOW_THRESHOLD (n, MULLOW_DC_THRESHOLD))
+ {
+ mpn_mullow_basecase (rp, xp, yp, n); /* schoolbook low product written straight into rp */
+ }
+ else if (BELOW_THRESHOLD (n, MULLOW_MUL_N_THRESHOLD))
+ {
+ /* Divide-and-conquer: one full x0*y0 product plus two recursive low products. */
+ mp_size_t n2 = n >> 1; /* floor(n/2) */
+ mp_size_t n1 = n - n2; /* ceil(n/2) */
+ mp_ptr tp;
+ TMP_SDECL;
+ TMP_SMARK;
+ tp = TMP_SALLOC_LIMBS (n1); /* scratch for one recursive result at a time; n1 >= n2 */
+
+ /* Split as x = x1 2^(n1 GMP_NUMB_BITS) + x0,
+ y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
+ /* i.e. x0 = {xp,n1}, x1 = {xp+n1,n2}, y0 = {yp,n2}, y1 = {yp+n2,n1}. */
+ /* x0 * y0: n2 x n2 product, then (n odd) the extra row for limb xp[n2]; fills {rp,n} */
+ mpn_mul_n (rp, xp, yp, n2);
+ if (n1 != n2)
+ rp[2 * n2] = mpn_addmul_1 (rp + n2, yp, n2, xp[n2]);
+
+ /* x1 * y0 * 2^(n1 GMP_NUMB_BITS): only its low n2 limbs fall below limb n */
+ mpn_mullow_n (tp, xp + n1, yp, n2);
+ mpn_add_n (rp + n1, rp + n1, tp, n2); /* carry out of limb n-1 is dropped */
+
+ /* x0 * y1 * 2^(n2 GMP_NUMB_BITS): only its low n1 limbs fall below limb n */
+ mpn_mullow_n (tp, yp + n2, xp, n1); /* operands passed in (y1, x0) order */
+ mpn_add_n (rp + n2, rp + n2, tp, n1); /* carry out of limb n-1 is dropped */
+ TMP_SFREE;
+ }
+ else
+ {
+ /* For really large operands, use plain mpn_mul_n but throw away upper n
+ limbs of result. */
+ mp_ptr tp;
+ TMP_DECL;
+ TMP_MARK;
+ tp = TMP_ALLOC_LIMBS (2 * n); /* room for the full 2n-limb product */
+
+ mpn_mul_n (tp, xp, yp, n);
+ MPN_COPY (rp, tp, n); /* keep only the low half */
+ TMP_FREE;
+ }
+}
diff --git a/gmp/mpn/generic/mulmid.c b/gmp/mpn/generic/mulmid.c
deleted file mode 100644
index 6b4ea3253d..0000000000
--- a/gmp/mpn/generic/mulmid.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/* mpn_mulmid -- middle product
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-#define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
-
-
-void
-mpn_mulmid (mp_ptr rp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn)
-{
- mp_size_t rn, k;
- mp_ptr scratch, temp;
-
- ASSERT (an >= bn);
- ASSERT (bn >= 1);
- ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
- ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
-
- if (bn < MULMID_TOOM42_THRESHOLD)
- {
- /* region not tall enough to make toom42 worthwhile for any portion */
-
- if (an < CHUNK)
- {
- /* region not too wide either, just call basecase directly */
- mpn_mulmid_basecase (rp, ap, an, bp, bn);
- return;
- }
-
- /* Region quite wide. For better locality, use basecase on chunks:
-
- AAABBBCC..
- .AAABBBCC.
- ..AAABBBCC
- */
-
- k = CHUNK - bn + 1; /* number of diagonals per chunk */
-
- /* first chunk (marked A in the above diagram) */
- mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
-
- /* remaining chunks (B, C, etc) */
- an -= k;
-
- while (an >= CHUNK)
- {
- mp_limb_t t0, t1, cy;
- ap += k, rp += k;
- t0 = rp[0], t1 = rp[1];
- mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
- ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
- MPN_INCR_U (rp + 1, k + 1, t1 + cy);
- an -= k;
- }
-
- if (an >= bn)
- {
- /* last remaining chunk */
- mp_limb_t t0, t1, cy;
- ap += k, rp += k;
- t0 = rp[0], t1 = rp[1];
- mpn_mulmid_basecase (rp, ap, an, bp, bn);
- ADDC_LIMB (cy, rp[0], rp[0], t0);
- MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
- }
-
- return;
- }
-
- /* region is tall enough for toom42 */
-
- rn = an - bn + 1;
-
- if (rn < MULMID_TOOM42_THRESHOLD)
- {
- /* region not wide enough to make toom42 worthwhile for any portion */
-
- TMP_DECL;
-
- if (bn < CHUNK)
- {
- /* region not too tall either, just call basecase directly */
- mpn_mulmid_basecase (rp, ap, an, bp, bn);
- return;
- }
-
- /* Region quite tall. For better locality, use basecase on chunks:
-
- AAAAA....
- .AAAAA...
- ..BBBBB..
- ...BBBBB.
- ....CCCCC
- */
-
- TMP_MARK;
-
- temp = TMP_ALLOC_LIMBS (rn + 2);
-
- /* first chunk (marked A in the above diagram) */
- bp += bn - CHUNK, an -= bn - CHUNK;
- mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
-
- /* remaining chunks (B, C, etc) */
- bn -= CHUNK;
-
- while (bn >= CHUNK)
- {
- ap += CHUNK, bp -= CHUNK;
- mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
- mpn_add_n (rp, rp, temp, rn + 2);
- bn -= CHUNK;
- }
-
- if (bn)
- {
- /* last remaining chunk */
- ap += CHUNK, bp -= bn;
- mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
- mpn_add_n (rp, rp, temp, rn + 2);
- }
-
- TMP_FREE;
- return;
- }
-
- /* we're definitely going to use toom42 somewhere */
-
- if (bn > rn)
- {
- /* slice region into chunks, use toom42 on all chunks except possibly
- the last:
-
- AA....
- .AA...
- ..BB..
- ...BB.
- ....CC
- */
-
- TMP_DECL;
- TMP_MARK;
-
- temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
- scratch = temp + rn + 2;
-
- /* first chunk (marked A in the above diagram) */
- bp += bn - rn;
- mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
-
- /* remaining chunks (B, C, etc) */
- bn -= rn;
-
- while (bn >= rn)
- {
- ap += rn, bp -= rn;
- mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
- mpn_add_n (rp, rp, temp, rn + 2);
- bn -= rn;
- }
-
- if (bn)
- {
- /* last remaining chunk */
- ap += rn, bp -= bn;
- mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
- mpn_add_n (rp, rp, temp, rn + 2);
- }
-
- TMP_FREE;
- }
- else
- {
- /* slice region into chunks, use toom42 on all chunks except possibly
- the last:
-
- AAABBBCC..
- .AAABBBCC.
- ..AAABBBCC
- */
-
- TMP_DECL;
- TMP_MARK;
-
- scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
-
- /* first chunk (marked A in the above diagram) */
- mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
-
- /* remaining chunks (B, C, etc) */
- rn -= bn;
-
- while (rn >= bn)
- {
- mp_limb_t t0, t1, cy;
- ap += bn, rp += bn;
- t0 = rp[0], t1 = rp[1];
- mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
- ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
- MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
- rn -= bn;
- }
-
- TMP_FREE;
-
- if (rn)
- {
- /* last remaining chunk */
- mp_limb_t t0, t1, cy;
- ap += bn, rp += bn;
- t0 = rp[0], t1 = rp[1];
- mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
- ADDC_LIMB (cy, rp[0], rp[0], t0);
- MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
- }
- }
-}
diff --git a/gmp/mpn/generic/mulmid_basecase.c b/gmp/mpn/generic/mulmid_basecase.c
deleted file mode 100644
index 400e976424..0000000000
--- a/gmp/mpn/generic/mulmid_basecase.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* mpn_mulmid_basecase -- classical middle product algorithm
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Middle product of {up,un} and {vp,vn}, write result to {rp,un-vn+3}.
- Must have un >= vn >= 1.
-
- Neither input buffer may overlap with the output buffer. */
-
-void
-mpn_mulmid_basecase (mp_ptr rp,
- mp_srcptr up, mp_size_t un,
- mp_srcptr vp, mp_size_t vn)
-{
- mp_limb_t lo, hi; /* last two limbs of output */
- mp_limb_t cy;
-
- ASSERT (un >= vn);
- ASSERT (vn >= 1);
- ASSERT (! MPN_OVERLAP_P (rp, un - vn + 3, up, un));
- ASSERT (! MPN_OVERLAP_P (rp, un - vn + 3, vp, vn));
-
- up += vn - 1;
- un -= vn - 1;
-
- /* multiply by first limb, store result */
- lo = mpn_mul_1 (rp, up, un, vp[0]);
- hi = 0;
-
- /* accumulate remaining rows */
- for (vn--; vn; vn--)
- {
- up--, vp++;
- cy = mpn_addmul_1 (rp, up, un, vp[0]);
- add_ssaaaa (hi, lo, hi, lo, CNST_LIMB(0), cy);
- }
-
- /* store final limbs */
-#if GMP_NAIL_BITS != 0
- hi = (hi << GMP_NAIL_BITS) + (lo >> GMP_NUMB_BITS);
- lo &= GMP_NUMB_MASK;
-#endif
-
- rp[un] = lo;
- rp[un + 1] = hi;
-}
diff --git a/gmp/mpn/generic/mulmid_n.c b/gmp/mpn/generic/mulmid_n.c
deleted file mode 100644
index 2280ba3a36..0000000000
--- a/gmp/mpn/generic/mulmid_n.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/* mpn_mulmid_n -- balanced middle product
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-void
-mpn_mulmid_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
-{
- ASSERT (n >= 1);
- ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
- ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
-
- if (n < MULMID_TOOM42_THRESHOLD)
- {
- mpn_mulmid_basecase (rp, ap, 2*n - 1, bp, n);
- }
- else
- {
- mp_ptr scratch;
- TMP_DECL;
- TMP_MARK;
- scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (n));
- mpn_toom42_mulmid (rp, ap, bp, n, scratch);
- TMP_FREE;
- }
-}
diff --git a/gmp/mpn/generic/mulmod_bnm1.c b/gmp/mpn/generic/mulmod_bnm1.c
deleted file mode 100644
index 8710324583..0000000000
--- a/gmp/mpn/generic/mulmod_bnm1.c
+++ /dev/null
@@ -1,355 +0,0 @@
-/* mulmod_bnm1.c -- multiplication mod B^n-1.
-
- Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
- Marco Bodrato.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
- mod B^rn - 1, and values are semi-normalised; zero is represented
- as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp.
- tp==rp is allowed. */
-void
-mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
- mp_ptr tp)
-{
- mp_limb_t cy;
-
- ASSERT (0 < rn);
-
- mpn_mul_n (tp, ap, bp, rn);
- cy = mpn_add_n (rp, tp, tp + rn, rn);
- /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
- * be no overflow when adding in the carry. */
- MPN_INCR_U (rp, rn, cy);
-}
-
-
-/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
- semi-normalised representation, computation is mod B^rn + 1. Needs
- a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
- Output is normalised. */
-static void
-mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
- mp_ptr tp)
-{
- mp_limb_t cy;
-
- ASSERT (0 < rn);
-
- mpn_mul_n (tp, ap, bp, rn + 1);
- ASSERT (tp[2*rn+1] == 0);
- ASSERT (tp[2*rn] < GMP_NUMB_MAX);
- cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
- rp[rn] = 0;
- MPN_INCR_U (rp, rn+1, cy );
-}
-
-
-/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
- *
- * The result is expected to be ZERO if and only if one of the operand
- * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
- * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
- * combine results and obtain a natural number when one knows in
- * advance that the final value is less than (B^rn-1).
- * Moreover it should not be a problem if mulmod_bnm1 is used to
- * compute the full product with an+bn <= rn, because this condition
- * implies (B^an-1)(B^bn-1) < (B^rn-1) .
- *
- * Requires 0 < bn <= an <= rn and an + bn > rn/2
- * Scratch need: rn + (need for recursive call OR rn + 4). This gives
- *
- * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
- */
-void
-mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
-{
- ASSERT (0 < bn);
- ASSERT (bn <= an);
- ASSERT (an <= rn);
-
- if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
- {
- if (UNLIKELY (bn < rn))
- {
- if (UNLIKELY (an + bn <= rn))
- {
- mpn_mul (rp, ap, an, bp, bn);
- }
- else
- {
- mp_limb_t cy;
- mpn_mul (tp, ap, an, bp, bn);
- cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
- MPN_INCR_U (rp, rn, cy);
- }
- }
- else
- mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
- }
- else
- {
- mp_size_t n;
- mp_limb_t cy;
- mp_limb_t hi;
-
- n = rn >> 1;
-
- /* We need at least an + bn >= n, to be able to fit one of the
- recursive products at rp. Requiring strict inequality makes
- the coded slightly simpler. If desired, we could avoid this
- restriction by initially halving rn as long as rn is even and
- an + bn <= rn/2. */
-
- ASSERT (an + bn > n);
-
- /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
- and crt together as
-
- x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
- */
-
-#define a0 ap
-#define a1 (ap + n)
-#define b0 bp
-#define b1 (bp + n)
-
-#define xp tp /* 2n + 2 */
- /* am1 maybe in {xp, n} */
- /* bm1 maybe in {xp + n, n} */
-#define sp1 (tp + 2*n + 2)
- /* ap1 maybe in {sp1, n + 1} */
- /* bp1 maybe in {sp1 + n + 1, n + 1} */
-
- {
- mp_srcptr am1, bm1;
- mp_size_t anm, bnm;
- mp_ptr so;
-
- bm1 = b0;
- bnm = bn;
- if (LIKELY (an > n))
- {
- am1 = xp;
- cy = mpn_add (xp, a0, n, a1, an - n);
- MPN_INCR_U (xp, n, cy);
- anm = n;
- so = xp + n;
- if (LIKELY (bn > n))
- {
- bm1 = so;
- cy = mpn_add (so, b0, n, b1, bn - n);
- MPN_INCR_U (so, n, cy);
- bnm = n;
- so += n;
- }
- }
- else
- {
- so = xp;
- am1 = a0;
- anm = an;
- }
-
- mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
- }
-
- {
- int k;
- mp_srcptr ap1, bp1;
- mp_size_t anp, bnp;
-
- bp1 = b0;
- bnp = bn;
- if (LIKELY (an > n)) {
- ap1 = sp1;
- cy = mpn_sub (sp1, a0, n, a1, an - n);
- sp1[n] = 0;
- MPN_INCR_U (sp1, n + 1, cy);
- anp = n + ap1[n];
- if (LIKELY (bn > n)) {
- bp1 = sp1 + n + 1;
- cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
- sp1[2*n+1] = 0;
- MPN_INCR_U (sp1 + n + 1, n + 1, cy);
- bnp = n + bp1[n];
- }
- } else {
- ap1 = a0;
- anp = an;
- }
-
- if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
- k=0;
- else
- {
- int mask;
- k = mpn_fft_best_k (n, 0);
- mask = (1<<k) - 1;
- while (n & mask) {k--; mask >>=1;};
- }
- if (k >= FFT_FIRST_K)
- xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
- else if (UNLIKELY (bp1 == b0))
- {
- ASSERT (anp + bnp <= 2*n+1);
- ASSERT (anp + bnp > n);
- ASSERT (anp >= bnp);
- mpn_mul (xp, ap1, anp, bp1, bnp);
- anp = anp + bnp - n;
- ASSERT (anp <= n || xp[2*n]==0);
- anp-= anp > n;
- cy = mpn_sub (xp, xp, n, xp + n, anp);
- xp[n] = 0;
- MPN_INCR_U (xp, n+1, cy);
- }
- else
- mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
- }
-
- /* Here the CRT recomposition begins.
-
- xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
- Division by 2 is a bitwise rotation.
-
- Assumes xp normalised mod (B^n+1).
-
- The residue class [0] is represented by [B^n-1]; except when
- both input are ZERO.
- */
-
-#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
-#if HAVE_NATIVE_mpn_rsh1add_nc
- cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
- hi = cy << (GMP_NUMB_BITS - 1);
- cy = 0;
- /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
- overflows, i.e. a further increment will not overflow again. */
-#else /* ! _nc */
- cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
- hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
- cy >>= 1;
- /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
- the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
-#endif
-#if GMP_NAIL_BITS == 0
- add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
-#else
- cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
- rp[n-1] ^= hi;
-#endif
-#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
-#if HAVE_NATIVE_mpn_add_nc
- cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
-#else /* ! _nc */
- cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
-#endif
- cy += (rp[0]&1);
- mpn_rshift(rp, rp, n, 1);
- ASSERT (cy <= 2);
- hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
- cy >>= 1;
- /* We can have cy != 0 only if hi = 0... */
- ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
- rp[n-1] |= hi;
- /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
-#endif
- ASSERT (cy <= 1);
- /* Next increment can not overflow, read the previous comments about cy. */
- ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
- MPN_INCR_U(rp, n, cy);
-
- /* Compute the highest half:
- ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
- */
- if (UNLIKELY (an + bn < rn))
- {
- /* Note that in this case, the only way the result can equal
- zero mod B^{rn} - 1 is if one of the inputs is zero, and
- then the output of both the recursive calls and this CRT
- reconstruction is zero, not B^{rn} - 1. Which is good,
- since the latter representation doesn't fit in the output
- area.*/
- cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);
-
- /* FIXME: This subtraction of the high parts is not really
- necessary, we do it to get the carry out, and for sanity
- checking. */
- cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
- xp + an + bn - n, rn - (an + bn), cy);
- ASSERT (an + bn == rn - 1 ||
- mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
- cy = mpn_sub_1 (rp, rp, an + bn, cy);
- ASSERT (cy == (xp + an + bn - n)[0]);
- }
- else
- {
- cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
- /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
- DECR will affect _at most_ the lowest n limbs. */
- MPN_DECR_U (rp, 2*n, cy);
- }
-#undef a0
-#undef a1
-#undef b0
-#undef b1
-#undef xp
-#undef sp1
- }
-}
-
-mp_size_t
-mpn_mulmod_bnm1_next_size (mp_size_t n)
-{
- mp_size_t nh;
-
- if (BELOW_THRESHOLD (n, MULMOD_BNM1_THRESHOLD))
- return n;
- if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
- return (n + (2-1)) & (-2);
- if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
- return (n + (4-1)) & (-4);
-
- nh = (n + 1) >> 1;
-
- if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
- return (n + (8-1)) & (-8);
-
- return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
-}
diff --git a/gmp/mpn/generic/neg.c b/gmp/mpn/generic/neg.c
deleted file mode 100644
index 2d752e912d..0000000000
--- a/gmp/mpn/generic/neg.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/* mpn_neg - negate an mpn.
-
-Copyright 2001, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define __GMP_FORCE_mpn_neg 1
-
-#include "gmp.h"
-#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/neg_n.c b/gmp/mpn/generic/neg_n.c
new file mode 100644
index 0000000000..1609204c90
--- /dev/null
+++ b/gmp/mpn/generic/neg_n.c
@@ -0,0 +1,23 @@
+/* mpn_neg_n - negate an mpn.
+
+Copyright 2001, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define __GMP_FORCE_mpn_neg_n 1
+
+#include "gmp.h"
+#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/nussbaumer_mul.c b/gmp/mpn/generic/nussbaumer_mul.c
deleted file mode 100644
index d2bf19ad56..0000000000
--- a/gmp/mpn/generic/nussbaumer_mul.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/* mpn_nussbaumer_mul -- Multiply {ap,an} and {bp,bn} using
- Nussbaumer's negacyclic convolution.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Multiply {ap,an} by {bp,bn}, and put the result in {pp, an+bn} */
-void
-mpn_nussbaumer_mul (mp_ptr pp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn)
-{
- mp_size_t rn;
- mp_ptr tp;
- TMP_DECL;
-
- ASSERT (an >= bn);
- ASSERT (bn > 0);
-
- TMP_MARK;
-
- if ((ap == bp) && (an == bn))
- {
- rn = mpn_sqrmod_bnm1_next_size (2*an);
- tp = TMP_ALLOC_LIMBS (mpn_sqrmod_bnm1_itch (rn, an));
- mpn_sqrmod_bnm1 (pp, rn, ap, an, tp);
- }
- else
- {
- rn = mpn_mulmod_bnm1_next_size (an + bn);
- tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (rn, an, bn));
- mpn_mulmod_bnm1 (pp, rn, ap, an, bp, bn, tp);
- }
-
- TMP_FREE;
-}
diff --git a/gmp/mpn/generic/perfpow.c b/gmp/mpn/generic/perfpow.c
deleted file mode 100644
index bbed6309d5..0000000000
--- a/gmp/mpn/generic/perfpow.c
+++ /dev/null
@@ -1,417 +0,0 @@
-/* mpn_perfect_power_p -- mpn perfect power detection.
-
- Contributed to the GNU project by Martin Boij.
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#define SMALL 20
-#define MEDIUM 100
-
-/* Return non-zero if {np,nn} == {xp,xn} ^ k.
- Algorithm:
- For s = 1, 2, 4, ..., s_max, compute the s least significant limbs of
- {xp,xn}^k. Stop if they don't match the s least significant limbs of
- {np,nn}.
-
- FIXME: Low xn limbs can be expected to always match, if computed as a mod
- B^{xn} root. So instead of using mpn_powlo, compute an approximation of the
- most significant (normalized) limb of {xp,xn} ^ k (and an error bound), and
- compare to {np, nn}. Or use an even cruder approximation based on fix-point
- base 2 logarithm. */
-static int
-pow_equals (mp_srcptr np, mp_size_t n,
- mp_srcptr xp,mp_size_t xn,
- mp_limb_t k, mp_bitcnt_t f,
- mp_ptr tp)
-{
- mp_limb_t *tp2;
- mp_bitcnt_t y, z;
- mp_size_t i, bn;
- int ans;
- mp_limb_t h, l;
- TMP_DECL;
-
- ASSERT (n > 1 || (n == 1 && np[0] > 1));
- ASSERT (np[n - 1] > 0);
- ASSERT (xn > 0);
-
- if (xn == 1 && xp[0] == 1)
- return 0;
-
- z = 1 + (n >> 1);
- for (bn = 1; bn < z; bn <<= 1)
- {
- mpn_powlo (tp, xp, &k, 1, bn, tp + bn);
- if (mpn_cmp (tp, np, bn) != 0)
- return 0;
- }
-
- TMP_MARK;
-
- /* Final check. Estimate the size of {xp,xn}^k before computing the power
- with full precision. Optimization: It might pay off to make a more
- accurate estimation of the logarithm of {xp,xn}, rather than using the
- index of the MSB. */
-
- MPN_SIZEINBASE_2EXP(y, xp, xn, 1);
- y -= 1; /* msb_index (xp, xn) */
-
- umul_ppmm (h, l, k, y);
- h -= l == 0; l--; /* two-limb decrement */
-
- z = f - 1; /* msb_index (np, n) */
- if (h == 0 && l <= z)
- {
- mp_limb_t size;
- size = l + k;
- ASSERT_ALWAYS (size >= k);
-
- y = 2 + size / GMP_LIMB_BITS;
- tp2 = TMP_ALLOC_LIMBS (y);
-
- i = mpn_pow_1 (tp, xp, xn, k, tp2);
- if (i == n && mpn_cmp (tp, np, n) == 0)
- ans = 1;
- else
- ans = 0;
- }
- else
- {
- ans = 0;
- }
-
- TMP_FREE;
- return ans;
-}
-
-
-/* Return non-zero if N = {np,n} is a kth power.
- I = {ip,n} = N^(-1) mod B^n. */
-static int
-is_kth_power (mp_ptr rp, mp_srcptr np,
- mp_limb_t k, mp_srcptr ip,
- mp_size_t n, mp_bitcnt_t f,
- mp_ptr tp)
-{
- mp_bitcnt_t b;
- mp_size_t rn, xn;
-
- ASSERT (n > 0);
- ASSERT ((k & 1) != 0 || k == 2);
- ASSERT ((np[0] & 1) != 0);
-
- if (k == 2)
- {
- b = (f + 1) >> 1;
- rn = 1 + b / GMP_LIMB_BITS;
- if (mpn_bsqrtinv (rp, ip, b, tp) != 0)
- {
- rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
- xn = rn;
- MPN_NORMALIZE (rp, xn);
- if (pow_equals (np, n, rp, xn, k, f, tp) != 0)
- return 1;
-
- /* Check if (2^b - r)^2 == n */
- mpn_neg (rp, rp, rn);
- rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
- MPN_NORMALIZE (rp, rn);
- if (pow_equals (np, n, rp, rn, k, f, tp) != 0)
- return 1;
- }
- }
- else
- {
- b = 1 + (f - 1) / k;
- rn = 1 + (b - 1) / GMP_LIMB_BITS;
- mpn_brootinv (rp, ip, rn, k, tp);
- if ((b % GMP_LIMB_BITS) != 0)
- rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
- MPN_NORMALIZE (rp, rn);
- if (pow_equals (np, n, rp, rn, k, f, tp) != 0)
- return 1;
- }
- MPN_ZERO (rp, rn); /* Untrash rp */
- return 0;
-}
-
-static int
-perfpow (mp_srcptr np, mp_size_t n,
- mp_limb_t ub, mp_limb_t g,
- mp_bitcnt_t f, int neg)
-{
- mp_ptr ip, tp, rp;
- mp_limb_t k;
- int ans;
- mp_bitcnt_t b;
- gmp_primesieve_t ps;
- TMP_DECL;
-
- ASSERT (n > 0);
- ASSERT ((np[0] & 1) != 0);
- ASSERT (ub > 0);
-
- TMP_MARK;
- gmp_init_primesieve (&ps);
- b = (f + 3) >> 1;
-
- ip = TMP_ALLOC_LIMBS (n);
- rp = TMP_ALLOC_LIMBS (n);
- tp = TMP_ALLOC_LIMBS (5 * n); /* FIXME */
- MPN_ZERO (rp, n);
-
- /* FIXME: It seems the inverse in ninv is needed only to get non-inverted
- roots. I.e., is_kth_power computes n^{1/2} as (n^{-1})^{-1/2} and
- similarly for nth roots. It should be more efficient to compute n^{1/2} as
- n * n^{-1/2}, with a mullo instead of a binvert. And we can do something
- similar for kth roots if we switch to an iteration converging to n^{1/k -
- 1}, and we can then eliminate this binvert call. */
- mpn_binvert (ip, np, 1 + (b - 1) / GMP_LIMB_BITS, tp);
- if (b % GMP_LIMB_BITS)
- ip[(b - 1) / GMP_LIMB_BITS] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
-
- if (neg)
- gmp_nextprime (&ps);
-
- ans = 0;
- if (g > 0)
- {
- ub = MIN (ub, g + 1);
- while ((k = gmp_nextprime (&ps)) < ub)
- {
- if ((g % k) == 0)
- {
- if (is_kth_power (rp, np, k, ip, n, f, tp) != 0)
- {
- ans = 1;
- goto ret;
- }
- }
- }
- }
- else
- {
- while ((k = gmp_nextprime (&ps)) < ub)
- {
- if (is_kth_power (rp, np, k, ip, n, f, tp) != 0)
- {
- ans = 1;
- goto ret;
- }
- }
- }
- ret:
- TMP_FREE;
- return ans;
-}
-
-static const unsigned short nrtrial[] = { 100, 500, 1000 };
-
-/* Table of (log_{p_i} 2) values, where p_i is the (nrtrial[i] + 1)'th prime
- number. */
-static const double logs[] =
- { 0.1099457228193620, 0.0847016403115322, 0.0772048195144415 };
-
-int
-mpn_perfect_power_p (mp_srcptr np, mp_size_t n)
-{
- mp_size_t ncn, s, pn, xn;
- mp_limb_t *nc, factor, g;
- mp_limb_t exp, *prev, *next, d, l, r, c, *tp, cry;
- mp_bitcnt_t twos, count;
- int ans, where, neg, trial;
- TMP_DECL;
-
- nc = (mp_ptr) np;
-
- neg = 0;
- if (n < 0)
- {
- neg = 1;
- n = -n;
- }
-
- if (n == 0 || (n == 1 && np[0] == 1))
- return 1;
-
- TMP_MARK;
-
- g = 0;
-
- ncn = n;
- twos = mpn_scan1 (np, 0);
- if (twos > 0)
- {
- if (twos == 1)
- {
- ans = 0;
- goto ret;
- }
- s = twos / GMP_LIMB_BITS;
- if (s + 1 == n && POW2_P (np[s]))
- {
- ans = ! (neg && POW2_P (twos));
- goto ret;
- }
- count = twos % GMP_LIMB_BITS;
- ncn = n - s;
- nc = TMP_ALLOC_LIMBS (ncn);
- if (count > 0)
- {
- mpn_rshift (nc, np + s, ncn, count);
- ncn -= (nc[ncn - 1] == 0);
- }
- else
- {
- MPN_COPY (nc, np + s, ncn);
- }
- g = twos;
- }
-
- if (ncn <= SMALL)
- trial = 0;
- else if (ncn <= MEDIUM)
- trial = 1;
- else
- trial = 2;
-
- where = 0;
- factor = mpn_trialdiv (nc, ncn, nrtrial[trial], &where);
-
- if (factor != 0)
- {
- if (twos == 0)
- {
- nc = TMP_ALLOC_LIMBS (ncn);
- MPN_COPY (nc, np, ncn);
- }
-
- /* Remove factors found by trialdiv. Optimization: Perhaps better to use
- the strategy in mpz_remove (). */
- prev = TMP_ALLOC_LIMBS (ncn + 2);
- next = TMP_ALLOC_LIMBS (ncn + 2);
- tp = TMP_ALLOC_LIMBS (4 * ncn);
-
- do
- {
- binvert_limb (d, factor);
- prev[0] = d;
- pn = 1;
- exp = 1;
- while (2 * pn - 1 <= ncn)
- {
- mpn_sqr (next, prev, pn);
- xn = 2 * pn;
- xn -= (next[xn - 1] == 0);
-
- if (mpn_divisible_p (nc, ncn, next, xn) == 0)
- break;
-
- exp <<= 1;
- pn = xn;
- MP_PTR_SWAP (next, prev);
- }
-
- /* Binary search for the exponent */
- l = exp + 1;
- r = 2 * exp - 1;
- while (l <= r)
- {
- c = (l + r) >> 1;
- if (c - exp > 1)
- {
- xn = mpn_pow_1 (tp, &d, 1, c - exp, next);
- if (pn + xn - 1 > ncn)
- {
- r = c - 1;
- continue;
- }
- mpn_mul (next, prev, pn, tp, xn);
- xn += pn;
- xn -= (next[xn - 1] == 0);
- }
- else
- {
- cry = mpn_mul_1 (next, prev, pn, d);
- next[pn] = cry;
- xn = pn + (cry != 0);
- }
-
- if (mpn_divisible_p (nc, ncn, next, xn) == 0)
- {
- r = c - 1;
- }
- else
- {
- exp = c;
- l = c + 1;
- MP_PTR_SWAP (next, prev);
- pn = xn;
- }
- }
-
- if (g == 0)
- g = exp;
- else
- g = mpn_gcd_1 (&g, 1, exp);
-
- if (g == 1)
- {
- ans = 0;
- goto ret;
- }
-
- mpn_divexact (next, nc, ncn, prev, pn);
- ncn = ncn - pn;
- ncn += next[ncn] != 0;
- MPN_COPY (nc, next, ncn);
-
- if (ncn == 1 && nc[0] == 1)
- {
- ans = ! (neg && POW2_P (g));
- goto ret;
- }
-
- factor = mpn_trialdiv (nc, ncn, nrtrial[trial], &where);
- }
- while (factor != 0);
- }
-
- MPN_SIZEINBASE_2EXP(count, nc, ncn, 1); /* log (nc) + 1 */
- d = (mp_limb_t) (count * logs[trial] + 1e-9) + 1;
- ans = perfpow (nc, ncn, d, g, count, neg);
-
- ret:
- TMP_FREE;
- return ans;
-}
diff --git a/gmp/mpn/generic/perfsqr.c b/gmp/mpn/generic/perfsqr.c
index bdd82ccd96..1995a944df 100644
--- a/gmp/mpn/generic/perfsqr.c
+++ b/gmp/mpn/generic/perfsqr.c
@@ -1,34 +1,23 @@
/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
zero otherwise.
-Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
+Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <stdio.h> /* for NULL */
#include "gmp.h"
@@ -113,20 +102,20 @@ see https://www.gnu.org/licenses/. */
/* FIXME: The %= here isn't good, and might destroy any savings from keeping
the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
- and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our
- normal case, so lets not worry too much about mod_1. */
-#define PERFSQR_MOD_PP(r, up, usize) \
- do { \
- if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \
- { \
- (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \
- PERFSQR_PP_INVERTED); \
- (r) %= PERFSQR_PP; \
- } \
- else \
- { \
- (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \
- } \
+ and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our
+ normal case, so lets not worry too much about mod_1. */
+#define PERFSQR_MOD_PP(r, up, usize) \
+ do { \
+ if (USE_PREINV_MOD_1) \
+ { \
+ (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \
+ PERFSQR_PP_INVERTED); \
+ (r) %= PERFSQR_PP; \
+ } \
+ else \
+ { \
+ (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \
+ } \
} while (0)
#define PERFSQR_MOD_IDX(idx, r, d, inv) \
@@ -156,7 +145,7 @@ see https://www.gnu.org/licenses/. */
} while (0)
/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
- sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */
+ sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */
#define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \
do { \
mp_limb_t m; \
@@ -196,7 +185,7 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
/* Check that we have even multiplicity of 2, and then check that the rest is
a possible perfect square. Leave disabled until we can determine this
really is an improvement. It it is, it could completely replace the
- simple probe above, since this should throw out more non-squares, but at
+ simple probe above, since this should through out more non-squares, but at
the expense of somewhat more cycles. */
{
mp_limb_t lo;
@@ -229,7 +218,7 @@ mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
TMP_DECL;
TMP_MARK;
- root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
+ root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB);
/* Iff mpn_sqrtrem returns zero, the square is perfect. */
res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
diff --git a/gmp/mpn/generic/popham.c b/gmp/mpn/generic/popham.c
index 13e529b7cd..be7c525036 100644
--- a/gmp/mpn/generic/popham.c
+++ b/gmp/mpn/generic/popham.c
@@ -1,33 +1,21 @@
/* mpn_popcount, mpn_hamdist -- mpn bit population count/hamming distance.
-Copyright 1994, 1996, 2000-2002, 2005, 2011, 2012 Free Software Foundation,
-Inc.
+Copyright 1994, 1996, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -42,14 +30,14 @@ see https://www.gnu.org/licenses/. */
#define POPHAM(u,v) u ^ v
#endif
-mp_bitcnt_t
+unsigned long
FNAME (mp_srcptr up,
#if OPERATION_hamdist
mp_srcptr vp,
#endif
- mp_size_t n) __GMP_NOTHROW
+ mp_size_t n)
{
- mp_bitcnt_t result = 0;
+ unsigned long result = 0;
mp_limb_t p0, p1, p2, p3, x, p01, p23;
mp_size_t i;
diff --git a/gmp/mpn/generic/pow_1.c b/gmp/mpn/generic/pow_1.c
index 2333206554..4bc9f434bc 100644
--- a/gmp/mpn/generic/pow_1.c
+++ b/gmp/mpn/generic/pow_1.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2002, 2014 Free Software Foundation, Inc.
+Copyright 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License along
+with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
@@ -45,9 +34,6 @@ mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
mp_size_t rn;
int par;
- ASSERT (bn >= 1);
- /* FIXME: Add operand overlap criteria */
-
if (exp <= 1)
{
if (exp == 0)
@@ -68,13 +54,11 @@ mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
so much time that the slowness of this code will be negligible. */
par = 0;
cnt = GMP_LIMB_BITS;
- x = exp;
- do
+ for (x = exp; x != 0; x >>= 1)
{
- par ^= x;
+ par ^= x & 1;
cnt--;
- x >>= 1;
- } while (x != 0);
+ }
exp <<= cnt;
if (bn == 1)
@@ -84,7 +68,7 @@ mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
if ((cnt & 1) != 0)
MP_PTR_SWAP (rp, tp);
- mpn_sqr (rp, bp, bn);
+ mpn_sqr_n (rp, bp, bn);
rn = 2 * bn; rn -= rp[rn - 1] == 0;
for (i = GMP_LIMB_BITS - cnt - 1;;)
@@ -99,7 +83,7 @@ mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
if (--i == 0)
break;
- mpn_sqr (tp, rp, rn);
+ mpn_sqr_n (tp, rp, rn);
rn = 2 * rn; rn -= tp[rn - 1] == 0;
MP_PTR_SWAP (rp, tp);
}
@@ -109,7 +93,7 @@ mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
if (((par ^ cnt) & 1) == 0)
MP_PTR_SWAP (rp, tp);
- mpn_sqr (rp, bp, bn);
+ mpn_sqr_n (rp, bp, bn);
rn = 2 * bn; rn -= rp[rn - 1] == 0;
for (i = GMP_LIMB_BITS - cnt - 1;;)
@@ -124,7 +108,7 @@ mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
if (--i == 0)
break;
- mpn_sqr (tp, rp, rn);
+ mpn_sqr_n (tp, rp, rn);
rn = 2 * rn; rn -= tp[rn - 1] == 0;
MP_PTR_SWAP (rp, tp);
}
diff --git a/gmp/mpn/generic/powlo.c b/gmp/mpn/generic/powlo.c
index adcd96eb51..ca3e1e9448 100644
--- a/gmp/mpn/generic/powlo.c
+++ b/gmp/mpn/generic/powlo.c
@@ -1,32 +1,21 @@
-/* mpn_powlo -- Compute R = U^E mod B^n, where B is the limb base.
+/* mpn_powlo -- Compute R = U^E mod R^n, where R is the limb base.
-Copyright 2007-2009, 2012 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
@@ -38,7 +27,7 @@ see https://www.gnu.org/licenses/. */
((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)
static inline mp_limb_t
-getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
+getbits (const mp_limb_t *p, unsigned long bi, int nbits)
{
int nbits_in_r;
mp_limb_t r;
@@ -51,10 +40,10 @@ getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
else
{
bi -= nbits; /* bit index of low bit to extract */
- i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */
- bi %= GMP_NUMB_BITS; /* bit index in low word */
+ i = bi / GMP_LIMB_BITS; /* word index of low bit to extract */
+ bi %= GMP_LIMB_BITS; /* bit index in low word */
r = p[i] >> bi; /* extract (low) bits */
- nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */
+ nbits_in_r = GMP_LIMB_BITS - bi; /* number of bits now in r */
if (nbits_in_r < nbits) /* did we get enough bits? */
r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */
return r & (((mp_limb_t ) 1 << nbits) - 1);
@@ -62,16 +51,16 @@ getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
}
static inline int
-win_size (mp_bitcnt_t eb)
+win_size (unsigned long eb)
{
int k;
- static mp_bitcnt_t x[] = {1,7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
+ static unsigned long x[] = {1,7,25,81,241,673,1793,4609,11521,28161,~0ul};
for (k = 0; eb > x[k]; k++)
;
return k;
}
-/* rp[n-1..0] = bp[n-1..0] ^ ep[en-1..0] mod B^n, B is the limb base.
+/* rp[n-1..0] = bp[n-1..0] ^ ep[en-1..0] mod R^n, R is the limb base.
Requires that ep[en-1] is non-zero.
Uses scratch space tp[3n-1..0], i.e., 3n words. */
void
@@ -80,7 +69,7 @@ mpn_powlo (mp_ptr rp, mp_srcptr bp,
mp_size_t n, mp_ptr tp)
{
int cnt;
- mp_bitcnt_t ebi;
+ long ebi;
int windowsize, this_windowsize;
mp_limb_t expbits;
mp_limb_t *pp, *this_pp, *last_pp;
@@ -92,11 +81,12 @@ mpn_powlo (mp_ptr rp, mp_srcptr bp,
TMP_MARK;
- MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);
+ count_leading_zeros (cnt, ep[en - 1]);
+ ebi = en * GMP_LIMB_BITS - cnt;
windowsize = win_size (ebi);
- pp = TMP_ALLOC_LIMBS ((n << (windowsize - 1)) + n); /* + n is for mullo ign part */
+ pp = TMP_ALLOC_LIMBS ((n << (windowsize - 1)) + n); /* + n is for mullow ign part */
this_pp = pp;
@@ -105,7 +95,7 @@ mpn_powlo (mp_ptr rp, mp_srcptr bp,
b2p = tp + 2*n;
/* Store b^2 in b2. */
- mpn_sqr (tp, bp, n); /* FIXME: Use "mpn_sqrlo" */
+ mpn_sqr_n (tp, bp, n); /* FIXME: Use "mpn_sqrlo" */
MPN_COPY (b2p, tp, n);
/* Precompute odd powers of b and put them in the temporary area at pp. */
@@ -113,14 +103,13 @@ mpn_powlo (mp_ptr rp, mp_srcptr bp,
{
last_pp = this_pp;
this_pp += n;
- mpn_mullo_n (this_pp, last_pp, b2p, n);
+ mpn_mullow_n (this_pp, last_pp, b2p, n);
}
expbits = getbits (ep, ebi, windowsize);
- if (ebi < windowsize)
+ ebi -= windowsize;
+ if (ebi < 0)
ebi = 0;
- else
- ebi -= windowsize;
count_trailing_zeros (cnt, expbits);
ebi += cnt;
@@ -132,7 +121,7 @@ mpn_powlo (mp_ptr rp, mp_srcptr bp,
{
while (getbit (ep, ebi) == 0)
{
- mpn_sqr (tp, rp, n); /* FIXME: Use "mpn_sqrlo" */
+ mpn_sqr_n (tp, rp, n); /* FIXME: Use "mpn_sqrlo" */
MPN_COPY (rp, tp, n);
ebi--;
if (ebi == 0)
@@ -143,14 +132,13 @@ mpn_powlo (mp_ptr rp, mp_srcptr bp,
bits <= windowsize, and such that the least significant bit is 1. */
expbits = getbits (ep, ebi, windowsize);
+ ebi -= windowsize;
this_windowsize = windowsize;
- if (ebi < windowsize)
+ if (ebi < 0)
{
- this_windowsize -= windowsize - ebi;
+ this_windowsize += ebi;
ebi = 0;
}
- else
- ebi -= windowsize;
count_trailing_zeros (cnt, expbits);
this_windowsize -= cnt;
@@ -159,13 +147,13 @@ mpn_powlo (mp_ptr rp, mp_srcptr bp,
do
{
- mpn_sqr (tp, rp, n);
+ mpn_sqr_n (tp, rp, n);
MPN_COPY (rp, tp, n);
this_windowsize--;
}
while (this_windowsize != 0);
- mpn_mullo_n (tp, rp, pp + n * (expbits >> 1), n);
+ mpn_mullow_n (tp, rp, pp + n * (expbits >> 1), n);
MPN_COPY (rp, tp, n);
}
diff --git a/gmp/mpn/generic/powm.c b/gmp/mpn/generic/powm.c
index 9968116016..c057ec2156 100644
--- a/gmp/mpn/generic/powm.c
+++ b/gmp/mpn/generic/powm.c
@@ -1,51 +1,37 @@
/* mpn_powm -- Compute R = U^E mod M.
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2007-2012 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/*
- BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
+ BASIC ALGORITHM, Compute b^e mod n, where n is odd.
- 1. W <- U
+ 1. w <- b
- 2. T <- (B^n * U) mod M Convert to REDC form
+ 2. While w^2 < n (and there are more bits in e)
+ w <- power left-to-right base-2 without reduction
- 3. Compute table U^1, U^3, U^5... of E-dependent size
+ 3. t <- (B^n * b) / n Convert to REDC form
- 4. While there are more bits in E
- W <- power left-to-right base-k
+ 4. Compute power table of e-dependent size
+
+ 5. While there are more bits in e
+ w <- power left-to-right base-k with reduction
TODO:
@@ -54,64 +40,51 @@ see https://www.gnu.org/licenses/. */
That will simplify the code using getbits. (Perhaps make getbits' sibling
getbit then have similar form, for symmetry.)
- * Write an itch function. Or perhaps get rid of tp parameter since the huge
- pp area is allocated locally anyway?
+ * Write an itch function.
* Choose window size without looping. (Superoptimize or think(tm).)
- * Handle small bases with initial, reduction-free exponentiation.
+ * How do we handle small bases?
+
+ * This is slower than old mpz code, in particular if we base it on redc_1
+ (use: #undef HAVE_NATIVE_mpn_addmul_2). Why?
+
+ * Make it sub-quadratic.
* Call new division functions, not mpn_tdiv_qr.
+ * Is redc obsolete with improved SB division?
+
* Consider special code for one-limb M.
- * How should we handle the redc1/redc2/redc_n choice?
- - redc1: T(binvert_1limb) + e * (n) * (T(mullo-1x1) + n*T(addmul_1))
- - redc2: T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2))
- - redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n)))
+ * CRT for N = odd*2^t:
+ Using Newton's method and 2-adic arithmetic:
+ m1_inv_m2 = 1/odd mod 2^t
+ Plain 2-adic (REDC) modexp:
+ r1 = a ^ b mod odd
+ Mullo+sqrlo-based modexp:
+ r2 = a ^ b mod 2^t
+ mullo, mul, add:
+ r = ((r2 - r1) * m1_i_m2 mod 2^t) * odd + r1
+
+ * How should we handle the redc1/redc2/redc3/redc4/redc_subquad choice?
+ - redc1: T(binvert_1limb) + e * (n) * (T(mullo1x1) + n*T(addmul_1))
+ - redc2: T(binvert_2limbs) + e * (n/2) * (T(mullo2x2) + n*T(addmul_2))
+ - redc3: T(binvert_3limbs) + e * (n/3) * (T(mullo3x3) + n*T(addmul_3))
This disregards the addmul_N constant term, but we could think of
- that as part of the respective mullo.
-
- * When U (the base) is small, we should start the exponentiation with plain
- operations, then convert that partial result to REDC form.
-
- * When U is just one limb, should it be handled without the k-ary tricks?
- We could keep a factor of B^n in W, but use U' = BU as base. After
- multiplying by this (pseudo two-limb) number, we need to multiply by 1/B
- mod M.
+ that as part of the respective mulloNxN.
*/
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-#undef MPN_REDC_1
-#define MPN_REDC_1(rp, up, mp, n, invm) \
- do { \
- mp_limb_t cy; \
- cy = mpn_redc_1 (rp, up, mp, n, invm); \
- if (cy != 0) \
- mpn_sub_n (rp, rp, mp, n); \
- } while (0)
-
-#undef MPN_REDC_2
-#define MPN_REDC_2(rp, up, mp, n, mip) \
- do { \
- mp_limb_t cy; \
- cy = mpn_redc_2 (rp, up, mp, n, mip); \
- if (cy != 0) \
- mpn_sub_n (rp, rp, mp, n); \
- } while (0)
-
-#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
-#define WANT_REDC_2 1
-#endif
#define getbit(p,bi) \
((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)
static inline mp_limb_t
-getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
+getbits (const mp_limb_t *p, unsigned long bi, int nbits)
{
int nbits_in_r;
mp_limb_t r;
@@ -124,27 +97,49 @@ getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
else
{
bi -= nbits; /* bit index of low bit to extract */
- i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */
- bi %= GMP_NUMB_BITS; /* bit index in low word */
+ i = bi / GMP_LIMB_BITS; /* word index of low bit to extract */
+ bi %= GMP_LIMB_BITS; /* bit index in low word */
r = p[i] >> bi; /* extract (low) bits */
- nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */
+ nbits_in_r = GMP_LIMB_BITS - bi; /* number of bits now in r */
if (nbits_in_r < nbits) /* did we get enough bits? */
r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */
return r & (((mp_limb_t ) 1 << nbits) - 1);
}
}
+#undef HAVE_NATIVE_mpn_addmul_2
+
+#ifndef HAVE_NATIVE_mpn_addmul_2
+#define REDC_2_THRESHOLD MP_SIZE_T_MAX
+#endif
+
+#ifndef REDC_2_THRESHOLD
+#define REDC_2_THRESHOLD 4
+#endif
+
+static void mpn_redc_n () {ASSERT_ALWAYS(0);}
+
static inline int
-win_size (mp_bitcnt_t eb)
+win_size (unsigned long eb)
{
int k;
- static mp_bitcnt_t x[] = {0,7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
- for (k = 1; eb > x[k]; k++)
+ static unsigned long x[] = {1,7,25,81,241,673,1793,4609,11521,28161,~0ul};
+ for (k = 0; eb > x[k]; k++)
;
return k;
}
-/* Convert U to REDC form, U_r = B^n * U mod M */
+#define MPN_REDC_X(rp, tp, mp, n, mip) \
+ do { \
+ if (redc_x == 1) \
+ mpn_redc_1 (rp, tp, mp, n, mip[0]); \
+ else if (redc_x == 2) \
+ mpn_redc_2 (rp, tp, mp, n, mip); \
+ else \
+ mpn_redc_n (rp, tp, mp, n, mip); \
+ } while (0)
+
+ /* Convert U to REDC form, U_r = B^n * U mod M */
static void
redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
{
@@ -164,19 +159,21 @@ redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
Requires that mp[n-1..0] is odd.
Requires that ep[en-1..0] is > 1.
- Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs. */
+ Uses scratch space tp[3n..0], i.e., 3n+1 words. */
void
mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
mp_srcptr ep, mp_size_t en,
mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
- mp_limb_t ip[2], *mip;
+ mp_limb_t mip[2];
int cnt;
- mp_bitcnt_t ebi;
+ long ebi;
int windowsize, this_windowsize;
mp_limb_t expbits;
- mp_ptr pp, this_pp;
+ mp_ptr pp, this_pp, last_pp;
+ mp_ptr b2p;
long i;
+ int redc_x;
TMP_DECL;
ASSERT (en > 1 || (en == 1 && ep[0] > 1));
@@ -184,7 +181,8 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
TMP_MARK;
- MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);
+ count_leading_zeros (cnt, ep[en - 1]);
+ ebi = en * GMP_LIMB_BITS - cnt;
#if 0
if (bn < n)
@@ -193,7 +191,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
until the result is greater than the mod argument. */
for (;;)
{
- mpn_sqr (tp, this_pp, tn);
+ mpn_sqr_n (tp, this_pp, tn);
tn = tn * 2 - 1, tn += tp[tn] != 0;
if (getbit (ep, ebi) != 0)
mpn_mul (..., tp, tn, bp, bn);
@@ -204,75 +202,49 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
windowsize = win_size (ebi);
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+ if (BELOW_THRESHOLD (n, REDC_2_THRESHOLD))
{
- mip = ip;
binvert_limb (mip[0], mp[0]);
mip[0] = -mip[0];
+ redc_x = 1;
}
- else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
+#if defined (HAVE_NATIVE_mpn_addmul_2)
+ else
{
- mip = ip;
mpn_binvert (mip, mp, 2, tp);
mip[0] = -mip[0]; mip[1] = ~mip[1];
- }
-#else
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- {
- mip = ip;
- binvert_limb (mip[0], mp[0]);
- mip[0] = -mip[0];
+ redc_x = 2;
}
#endif
- else
- {
- mip = TMP_ALLOC_LIMBS (n);
- mpn_binvert (mip, mp, n, tp);
- }
+#if 0
+ mpn_binvert (mip, mp, n, tp);
+ redc_x = 0;
+#endif
pp = TMP_ALLOC_LIMBS (n << (windowsize - 1));
this_pp = pp;
redcify (this_pp, bp, bn, mp, n);
- /* Store b^2 at rp. */
- mpn_sqr (tp, this_pp, n);
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- MPN_REDC_1 (rp, tp, mp, n, mip[0]);
- else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
- MPN_REDC_2 (rp, tp, mp, n, mip);
-#else
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- MPN_REDC_1 (rp, tp, mp, n, mip[0]);
-#endif
- else
- mpn_redc_n (rp, tp, mp, n, mip);
+ b2p = tp + 2*n;
+
+ /* Store b^2 in b2. */
+ mpn_sqr_n (tp, this_pp, n);
+ MPN_REDC_X (b2p, tp, mp, n, mip);
/* Precompute odd powers of b and put them in the temporary area at pp. */
for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
{
- mpn_mul_n (tp, this_pp, rp, n);
+ last_pp = this_pp;
this_pp += n;
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
- else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
- MPN_REDC_2 (this_pp, tp, mp, n, mip);
-#else
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
-#endif
- else
- mpn_redc_n (this_pp, tp, mp, n, mip);
+ mpn_mul_n (tp, last_pp, b2p, n);
+ MPN_REDC_X (this_pp, tp, mp, n, mip);
}
expbits = getbits (ep, ebi, windowsize);
- if (ebi < windowsize)
+ ebi -= windowsize;
+ if (ebi < 0)
ebi = 0;
- else
- ebi -= windowsize;
count_trailing_zeros (cnt, expbits);
ebi += cnt;
@@ -280,311 +252,51 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
MPN_COPY (rp, pp + n * (expbits >> 1), n);
-#define INNERLOOP \
- while (ebi != 0) \
- { \
- while (getbit (ep, ebi) == 0) \
- { \
- MPN_SQR (tp, rp, n); \
- MPN_REDUCE (rp, tp, mp, n, mip); \
- ebi--; \
- if (ebi == 0) \
- goto done; \
- } \
- \
- /* The next bit of the exponent is 1. Now extract the largest \
- block of bits <= windowsize, and such that the least \
- significant bit is 1. */ \
- \
- expbits = getbits (ep, ebi, windowsize); \
- this_windowsize = windowsize; \
- if (ebi < windowsize) \
- { \
- this_windowsize -= windowsize - ebi; \
- ebi = 0; \
- } \
- else \
- ebi -= windowsize; \
- \
- count_trailing_zeros (cnt, expbits); \
- this_windowsize -= cnt; \
- ebi += cnt; \
- expbits >>= cnt; \
- \
- do \
- { \
- MPN_SQR (tp, rp, n); \
- MPN_REDUCE (rp, tp, mp, n, mip); \
- this_windowsize--; \
- } \
- while (this_windowsize != 0); \
- \
- MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n); \
- MPN_REDUCE (rp, tp, mp, n, mip); \
- }
-
-
-#if WANT_REDC_2
- if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD)
- {
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- {
- if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD
- || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- }
- else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
- {
- if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
- || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
- INNERLOOP;
- }
- }
- else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
- INNERLOOP;
- }
- }
- else
+ while (ebi != 0)
{
- if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+ while (getbit (ep, ebi) == 0)
{
- if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
- || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- }
- else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
- INNERLOOP;
+ mpn_sqr_n (tp, rp, n);
+ MPN_REDC_X (rp, tp, mp, n, mip);
+ ebi--;
+ if (ebi == 0)
+ goto done;
}
- }
-#else /* WANT_REDC_2 */
+ /* The next bit of the exponent is 1. Now extract the largest block of
+ bits <= windowsize, and such that the least significant bit is 1. */
- if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD)
- {
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- {
- if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD
- || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- }
- else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+ expbits = getbits (ep, ebi, windowsize);
+ ebi -= windowsize;
+ this_windowsize = windowsize;
+ if (ebi < 0)
{
- if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
- || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
- INNERLOOP;
- }
+ this_windowsize += ebi;
+ ebi = 0;
}
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
- INNERLOOP;
- }
- }
- else
- {
- if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
- {
- if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
- || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- }
- else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- else
+
+ count_trailing_zeros (cnt, expbits);
+ this_windowsize -= cnt;
+ ebi += cnt;
+ expbits >>= cnt;
+
+ do
{
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
- INNERLOOP;
+ mpn_sqr_n (tp, rp, n);
+ MPN_REDC_X (rp, tp, mp, n, mip);
+ this_windowsize--;
}
+ while (this_windowsize != 0);
+
+ mpn_mul_n (tp, rp, pp + n * (expbits >> 1), n);
+ MPN_REDC_X (rp, tp, mp, n, mip);
}
-#endif /* WANT_REDC_2 */
done:
-
MPN_COPY (tp, rp, n);
MPN_ZERO (tp + n, n);
-
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- MPN_REDC_1 (rp, tp, mp, n, mip[0]);
- else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
- MPN_REDC_2 (rp, tp, mp, n, mip);
-#else
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- MPN_REDC_1 (rp, tp, mp, n, mip[0]);
-#endif
- else
- mpn_redc_n (rp, tp, mp, n, mip);
-
+ MPN_REDC_X (rp, tp, mp, n, mip);
if (mpn_cmp (rp, mp, n) >= 0)
mpn_sub_n (rp, rp, mp, n);
-
TMP_FREE;
}
diff --git a/gmp/mpn/generic/powm_sec.c b/gmp/mpn/generic/powm_sec.c
new file mode 100644
index 0000000000..26d77b5c81
--- /dev/null
+++ b/gmp/mpn/generic/powm_sec.c
@@ -0,0 +1,272 @@
+/* mpn_powm_sec -- Compute R = U^E mod M. Safe variant, not leaking time info.
+
+Copyright 2007, 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+
+/*
+ BASIC ALGORITHM, Compute b^e mod n, where n is odd.
+
+ 1. w <- b
+
+ 2. While w^2 < n (and there are more bits in e)
+ w <- power left-to-right base-2 without reduction
+
+ 3. t <- (B^n * b) mod n Convert to REDC form
+
+ 4. Compute power table of e-dependent size
+
+ 5. While there are more bits in e
+ w <- power left-to-right base-k with reduction
+
+
+ TODO:
+
+ * Make getbits a macro, thereby allowing it to update the index operand.
+ That will simplify the code using getbits. (Perhaps make getbits' sibling
+ getbit then have similar form, for symmetry.)
+
+ * Write an itch function.
+
+ * Choose window size without looping. (Superoptimize or think(tm).)
+
+ * Make it sub-quadratic.
+
+ * Call new division functions, not mpn_tdiv_qr.
+
+ * Is redc obsolete with improved SB division?
+
+ * Consider special code for one-limb M.
+
+ * Handle even M (in mpz_powm_sec) with two modexps and CRT.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define WANT_CACHE_SECURITY 1
+
+
+#define getbit(p,bi) \
+  ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)
+
+/* Return a window of bits from the limb array p.  bi counts the bits still
+   to be consumed, so the window covers bit positions bi-nbits .. bi-1
+   (0-based, bit 0 being the least significant bit of p[0]).  When fewer
+   than nbits bits are left, the low bi bits of p[0] are returned instead.  */
+static inline mp_limb_t
+getbits (const mp_limb_t *p, unsigned long bi, int nbits)
+{
+  mp_limb_t field;
+  mp_size_t limb_idx;
+  int avail;
+
+  /* Short final window: hand back everything that remains.  */
+  if (bi < nbits)
+    return p[0] & (((mp_limb_t) 1 << bi) - 1);
+
+  bi -= nbits;                          /* position of the window's low bit */
+  limb_idx = bi / GMP_LIMB_BITS;        /* limb holding that bit */
+  bi %= GMP_LIMB_BITS;                  /* offset of that bit inside the limb */
+
+  field = p[limb_idx] >> bi;
+  avail = GMP_LIMB_BITS - bi;           /* bits obtained from this limb */
+  if (avail < nbits)
+    field += p[limb_idx + 1] << avail;  /* window straddles two limbs */
+
+  return field & (((mp_limb_t) 1 << nbits) - 1);
+}
+
+/* HAVE_NATIVE_mpn_addmul_2 is unconditionally undefined here, so the #ifndef
+   that follows always fires and REDC_2_THRESHOLD becomes MP_SIZE_T_MAX.  */
+#undef HAVE_NATIVE_mpn_addmul_2
+
+#ifndef HAVE_NATIVE_mpn_addmul_2
+#define REDC_2_THRESHOLD MP_SIZE_T_MAX
+#endif
+
+/* Fallback default; not reached while the definition above is active.  */
+#ifndef REDC_2_THRESHOLD
+#define REDC_2_THRESHOLD 4
+#endif
+
+/* Placeholder target for the final branch of MPN_REDC_X; it fails an
+   ASSERT_ALWAYS if it is ever called.  */
+static void mpn_redc_n () {ASSERT_ALWAYS(0);}
+
+/* Choose the window size for an exponent of eb bits: the result is the index
+   of the first table entry that is not smaller than eb.  The trailing ~0ul
+   entry is a sentinel that guarantees the scan terminates.  */
+static inline int
+win_size (unsigned long eb)
+{
+  static unsigned long limits[] =
+    {1,4,27,100,325,1026,2905,7848,20457,51670,~0ul};
+  int w = 0;
+
+  while (eb > limits[w])
+    w++;
+  return w;
+}
+
+/* Reduce tp into rp modulo mp using the REDC variant chosen by the variable
+   redc_x, which must be in scope wherever this macro is expanded:
+   1 calls mpn_redc_1 (passing only mip[0]), 2 calls mpn_redc_2, and any
+   other value calls mpn_redc_n.  */
+#define MPN_REDC_X(rp, tp, mp, n, mip)					\
+  do {									\
+    if (redc_x == 1)							\
+      mpn_redc_1 (rp, tp, mp, n, mip[0]);				\
+    else if (redc_x == 2)						\
+      mpn_redc_2 (rp, tp, mp, n, mip);					\
+    else								\
+      mpn_redc_n (rp, tp, mp, n, mip);					\
+  } while (0)
+
+/* Convert U = up[un-1..0] to REDC form with respect to M = mp[n-1..0]:
+   rp[n-1..0] = B^n * U mod M.  */
+static void
+redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
+{
+  mp_ptr num, quot;
+  TMP_DECL;
+  TMP_MARK;
+
+  num = TMP_ALLOC_LIMBS (un + n);
+  quot = TMP_ALLOC_LIMBS (un + 1); /* FIXME: Put at tp+? */
+
+  /* Form the dividend B^n * U: U sits in the high un limbs, above n zero
+     limbs.  */
+  MPN_COPY (num + n, up, un);
+  MPN_ZERO (num, n);
+
+  /* Only the remainder (stored at rp) is wanted; the quotient is scratch.  */
+  mpn_tdiv_qr (quot, rp, 0L, num, un + n, mp, n);
+  TMP_FREE;
+}
+
+/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
+   Requires that mp[n-1..0] is odd.
+   Requires that ep[en-1..0] is > 1.
+   Uses scratch space tp[3n..0], i.e., 3n+1 words.
+
+   The exponent is consumed from its most significant bit downwards in
+   windows of windowsize bits (the last window may be shorter).  A table of
+   all powers b^0 .. b^(2^windowsize - 1), in REDC form, is built first.
+   When WANT_CACHE_SECURITY is set, every table fetch, including the one
+   for the first window, goes through mpn_tabselect so that the addresses
+   touched do not depend on the exponent bits.  */
+void
+mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
+              mp_srcptr ep, mp_size_t en,
+              mp_srcptr mp, mp_size_t n, mp_ptr tp)
+{
+  mp_limb_t mip[2];
+  int cnt;
+  long ebi;
+  int windowsize, this_windowsize;
+  mp_limb_t expbits;
+  mp_ptr pp, this_pp, last_pp;
+  long i;
+  int redc_x;
+  TMP_DECL;
+
+  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
+  ASSERT (n >= 1 && ((mp[0] & 1) != 0));
+
+  TMP_MARK;
+
+  /* ebi = number of significant bits in the exponent.  */
+  count_leading_zeros (cnt, ep[en - 1]);
+  ebi = en * GMP_LIMB_BITS - cnt;
+
+  windowsize = win_size (ebi);
+
+  /* Compute the negated inverse of the modulus needed by the selected
+     REDC variant.  */
+  if (BELOW_THRESHOLD (n, REDC_2_THRESHOLD))
+    {
+      binvert_limb (mip[0], mp[0]);
+      mip[0] = -mip[0];
+      redc_x = 1;
+    }
+#if defined (HAVE_NATIVE_mpn_addmul_2)
+  else
+    {
+      mpn_binvert (mip, mp, 2, tp);
+      mip[0] = -mip[0]; mip[1] = ~mip[1];
+      redc_x = 2;
+    }
+#endif
+#if 0
+  mpn_binvert (mip, mp, n, tp);
+  redc_x = 0;
+#endif
+
+  /* Table of 2^windowsize entries of n limbs each.  */
+  pp = TMP_ALLOC_LIMBS (n << windowsize);
+
+  /* Entry 0: the value 1 converted to REDC form.  The limb at pp[n] is used
+     as a temporary one-limb source operand holding 1.  */
+  this_pp = pp;
+  this_pp[n] = 1;
+  redcify (this_pp, this_pp + n, 1, mp, n);
+  /* Entry 1: the base b converted to REDC form.  */
+  this_pp += n;
+  redcify (this_pp, bp, bn, mp, n);
+
+  /* Precompute powers of b and put them in the temporary area at pp.
+     Each new entry is the previous entry times entry 1.  */
+  for (i = (1 << windowsize) - 2; i > 0; i--)
+    {
+      last_pp = this_pp;
+      this_pp += n;
+      mpn_mul_n (tp, last_pp, pp + n, n);
+      MPN_REDC_X (this_pp, tp, mp, n, mip);
+    }
+
+  /* First (most significant) window: initialise the accumulator from the
+     table.  */
+  expbits = getbits (ep, ebi, windowsize);
+  ebi -= windowsize;
+  if (ebi < 0)
+    ebi = 0;
+
+#if WANT_CACHE_SECURITY
+  /* Read the whole table, as for the later windows, instead of indexing it
+     directly with secret exponent bits.  */
+  mpn_tabselect (rp, pp, n, 1 << windowsize, expbits);
+#else
+  MPN_COPY (rp, pp + n * expbits, n);
+#endif
+
+  while (ebi != 0)
+    {
+      expbits = getbits (ep, ebi, windowsize);
+      ebi -= windowsize;
+      this_windowsize = windowsize;
+      if (ebi < 0)
+        {
+          /* Fewer than windowsize bits were left; shrink this window.  */
+          this_windowsize += ebi;
+          ebi = 0;
+        }
+
+      /* One squaring per bit of the window.  */
+      do
+        {
+          mpn_sqr_n (tp, rp, n);
+          MPN_REDC_X (rp, tp, mp, n, mip);
+          this_windowsize--;
+        }
+      while (this_windowsize != 0);
+
+      /* Multiply by the table entry selected by the window bits.  */
+#if WANT_CACHE_SECURITY
+      mpn_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits);
+      mpn_mul_n (tp, rp, tp + 2*n, n);
+#else
+      mpn_mul_n (tp, rp, pp + n * expbits, n);
+#endif
+      MPN_REDC_X (rp, tp, mp, n, mip);
+    }
+
+  /* Convert out of REDC form: reduce the result extended with n zero high
+     limbs, then bring it into the range [0, m).  */
+  MPN_COPY (tp, rp, n);
+  MPN_ZERO (tp + n, n);
+  MPN_REDC_X (rp, tp, mp, n, mip);
+  if (mpn_cmp (rp, mp, n) >= 0)
+    mpn_sub_n (rp, rp, mp, n);
+  TMP_FREE;
+}
+
+#if ! HAVE_NATIVE_mpn_tabselect
+/* Copy entry number `which' of table `tab' to rp.  The table holds nents
+   entries of n limbs each.  Every entry is read and combined under a mask,
+   so the sequence of memory accesses is the same whichever entry is
+   wanted.  O(n*nents).  */
+
+void
+mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n,
+               mp_size_t nents, mp_size_t which)
+{
+  volatile mp_limb_t *entry = tab;
+  mp_size_t ent, j;
+
+  for (ent = 0; ent < nents; ent++, entry += n)
+    {
+      /* All ones for the wanted entry, all zeros for every other one.  */
+      mp_limb_t keep = -(mp_limb_t) (which == ent);
+
+      for (j = 0; j < n; j++)
+        rp[j] = (rp[j] & ~keep) | (entry[j] & keep);
+    }
+}
+#endif
diff --git a/gmp/mpn/generic/pre_divrem_1.c b/gmp/mpn/generic/pre_divrem_1.c
index 8027f0216e..6badf63192 100644
--- a/gmp/mpn/generic/pre_divrem_1.c
+++ b/gmp/mpn/generic/pre_divrem_1.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2000-2003 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -43,8 +32,8 @@ see https://www.gnu.org/licenses/. */
/* Same test here for skipping one divide step as in mpn_divrem_1.
The main reason for a separate shift==0 case is that not all CPUs give
- zero for "n0 >> GMP_LIMB_BITS" which would arise in the general case
- code used on shift==0. shift==0 is also reasonably common in mp_bases
+ zero for "n0 >> BITS_PER_MP_LIMB" which would arise in the general case
+ code used on shift==0. shift==0 is also reasonably common in __mp_bases
big_base, for instance base==10 on a 64-bit limb.
Under shift!=0 it would be possible to call mpn_lshift to adjust the
@@ -117,14 +106,14 @@ mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t xsize,
}
n1 = ap[size-1];
- r |= n1 >> (GMP_LIMB_BITS - shift);
+ r |= n1 >> (BITS_PER_MP_LIMB - shift);
for (i = size-2; i >= 0; i--)
{
ASSERT (r < d);
n0 = ap[i];
udiv_qrnnd_preinv (*qp, r, r,
- ((n1 << shift) | (n0 >> (GMP_LIMB_BITS - shift))),
+ ((n1 << shift) | (n0 >> (BITS_PER_MP_LIMB - shift))),
d, dinv);
qp--;
n1 = n0;
diff --git a/gmp/mpn/generic/pre_mod_1.c b/gmp/mpn/generic/pre_mod_1.c
index cb38f4a48f..961733ba34 100644
--- a/gmp/mpn/generic/pre_mod_1.c
+++ b/gmp/mpn/generic/pre_mod_1.c
@@ -2,34 +2,23 @@
DINV should be 2^(2*GMP_LIMB_BITS) / D - 2^GMP_LIMB_BITS.
Return the single-limb remainder.
-Copyright 1991, 1993, 1994, 2000-2002, 2004, 2005 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2004, 2005 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License along
+with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -45,6 +34,7 @@ mpn_preinv_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d, mp_limb_t dinv)
{
mp_size_t i;
mp_limb_t n0, r;
+ mp_limb_t dummy;
ASSERT (un >= 1);
ASSERT (d & GMP_LIMB_HIGHBIT);
@@ -56,7 +46,7 @@ mpn_preinv_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d, mp_limb_t dinv)
for (i = un - 2; i >= 0; i--)
{
n0 = up[i];
- udiv_rnnd_preinv (r, r, n0, d, dinv);
+ udiv_qrnnd_preinv (dummy, r, r, n0, d, dinv);
}
return r;
}
diff --git a/gmp/mpn/generic/random.c b/gmp/mpn/generic/random.c
index 5489becf4d..c0b85ea075 100644
--- a/gmp/mpn/generic/random.c
+++ b/gmp/mpn/generic/random.c
@@ -5,28 +5,17 @@ Copyright 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/random2.c b/gmp/mpn/generic/random2.c
index 980b15367f..e29238c514 100644
--- a/gmp/mpn/generic/random2.c
+++ b/gmp/mpn/generic/random2.c
@@ -1,38 +1,28 @@
/* mpn_random2 -- Generate random numbers with relatively long strings
of ones and zeroes. Suitable for border testing.
-Copyright 1992-1994, 1996, 2000-2002, 2004, 2012 Free Software Foundation, Inc.
+Copyright 1992, 1993, 1994, 1996, 2000, 2001, 2002, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
-static void gmp_rrandomb (mp_ptr, gmp_randstate_t, mp_bitcnt_t);
+static void gmp_rrandomb __GMP_PROTO ((mp_ptr, gmp_randstate_t, unsigned long int));
/* Ask _gmp_rand for 32 bits per call unless that's more than a limb can hold.
Thus, we get the same random number sequence in the common cases.
@@ -64,15 +54,15 @@ mpn_random2 (mp_ptr rp, mp_size_t n)
}
static void
-gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, mp_bitcnt_t nbits)
+gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits)
{
- mp_bitcnt_t bi;
+ unsigned long int bi;
mp_limb_t ranm; /* buffer for random bits */
unsigned cap_chunksize, chunksize;
mp_size_t i;
/* Set entire result to 111..1 */
- i = BITS_TO_LIMBS (nbits) - 1;
+ i = (nbits + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS - 1;
rp[i] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - (nbits % GMP_NUMB_BITS)) % GMP_NUMB_BITS;
for (i = i - 1; i >= 0; i--)
rp[i] = GMP_NUMB_MAX;
diff --git a/gmp/mpn/generic/redc_1.c b/gmp/mpn/generic/redc_1.c
index 0d33421f63..47bee8220b 100644
--- a/gmp/mpn/generic/redc_1.c
+++ b/gmp/mpn/generic/redc_1.c
@@ -1,57 +1,43 @@
-/* mpn_redc_1. Set rp[] <- up[]/R^n mod mp[]. Clobber up[].
+/* mpn_redc_1. Set rp[] <- up[]/R^n mod mp[]. Clobber up[].
mp[] is n limbs; up[] is 2n limbs.
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-Copyright (C) 2000-2002, 2004, 2008, 2009, 2012 Free Software Foundation, Inc.
+Copyright (C) 2000, 2001, 2002, 2004, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
-mp_limb_t
+void
mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
{
mp_size_t j;
mp_limb_t cy;
- ASSERT (n > 0);
ASSERT_MPN (up, 2*n);
for (j = n - 1; j >= 0; j--)
{
- cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
- ASSERT (up[0] == 0);
- up[0] = cy;
+ up[0] = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
up++;
}
-
cy = mpn_add_n (rp, up, up - n, n);
- return cy;
+ if (cy != 0)
+ mpn_sub_n (rp, rp, mp, n);
}
diff --git a/gmp/mpn/generic/redc_2.c b/gmp/mpn/generic/redc_2.c
index 07d90fa20d..0efbd9d4c7 100644
--- a/gmp/mpn/generic/redc_2.c
+++ b/gmp/mpn/generic/redc_2.c
@@ -1,36 +1,25 @@
-/* mpn_redc_2. Set rp[] <- up[]/R^n mod mp[]. Clobber up[].
+/* mpn_redc_2. Set rp[] <- up[]/R^n mod mp[]. Clobber up[].
mp[] is n limbs; up[] is 2n limbs.
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-Copyright (C) 2000-2002, 2004, 2008, 2012 Free Software Foundation, Inc.
+Copyright (C) 2000, 2001, 2002, 2004, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -44,8 +33,7 @@ you lose
/* For testing purposes, define our own mpn_addmul_2 if there is none already
available. */
#ifndef HAVE_NATIVE_mpn_addmul_2
-#undef mpn_addmul_2
-static mp_limb_t
+mp_limb_t
mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
{
rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
@@ -53,7 +41,7 @@ mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
}
#endif
-#if defined (__GNUC__) && defined (__ia64) && W_TYPE_SIZE == 64
+#if defined (__ia64) && W_TYPE_SIZE == 64
#define umul2low(ph, pl, uh, ul, vh, vl) \
do { \
mp_limb_t _ph, _pl; \
@@ -78,7 +66,7 @@ mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
} while (0)
#endif
-mp_limb_t
+void
mpn_redc_2 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr mip)
{
mp_limb_t q[2];
@@ -86,7 +74,6 @@ mpn_redc_2 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr mip)
mp_limb_t upn;
mp_limb_t cy;
- ASSERT (n > 0);
ASSERT_MPN (up, 2*n);
if ((n & 1) != 0)
@@ -104,7 +91,7 @@ mpn_redc_2 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr mip)
up[n] = upn;
up += 2;
}
-
cy = mpn_add_n (rp, up, up - n, n);
- return cy;
+ if (cy != 0)
+ mpn_sub_n (rp, rp, mp, n);
}
diff --git a/gmp/mpn/generic/redc_n.c b/gmp/mpn/generic/redc_n.c
deleted file mode 100644
index c3d0cfe7fa..0000000000
--- a/gmp/mpn/generic/redc_n.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/* mpn_redc_n. Set rp[] <- up[]/R^n mod mp[]. Clobber up[].
- mp[] is n limbs; up[] is 2n limbs, the inverse ip[] is n limbs.
-
- THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-
-Copyright 2009, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- TODO
-
- * We assume mpn_mulmod_bnm1 is always faster than plain mpn_mul_n (or a
- future mpn_mulhi) for the range we will be called. Follow up that
- assumption.
-
- * Decrease scratch usage.
-
- * Consider removing the residue canonicalisation.
-*/
-
-void
-mpn_redc_n (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr ip)
-{
- mp_ptr xp, yp, scratch;
- mp_limb_t cy;
- mp_size_t rn;
- TMP_DECL;
- TMP_MARK;
-
- ASSERT (n > 8);
-
- rn = mpn_mulmod_bnm1_next_size (n);
-
- scratch = TMP_ALLOC_LIMBS (n + rn + mpn_mulmod_bnm1_itch (rn, n, n));
-
- xp = scratch;
- mpn_mullo_n (xp, up, ip, n);
-
- yp = scratch + n;
- mpn_mulmod_bnm1 (yp, rn, xp, n, mp, n, scratch + n + rn);
-
- ASSERT_ALWAYS (2 * n > rn); /* could handle this */
-
- cy = mpn_sub_n (yp + rn, yp, up, 2*n - rn); /* undo wrap around */
- MPN_DECR_U (yp + 2*n - rn, rn, cy);
-
- cy = mpn_sub_n (rp, up + n, yp + n, n);
- if (cy != 0)
- mpn_add_n (rp, rp, mp, n);
-
- TMP_FREE;
-}
diff --git a/gmp/mpn/generic/remove.c b/gmp/mpn/generic/remove.c
deleted file mode 100644
index ef1a06ea14..0000000000
--- a/gmp/mpn/generic/remove.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/* mpn_remove -- divide out all multiples of odd mpn number from another mpn
- number.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2009, 2012, 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#if GMP_LIMB_BITS > 50
-#define LOG 50
-#else
-#define LOG GMP_LIMB_BITS
-#endif
-
-
-/* Input: U = {up,un}, V = {vp,vn} must be odd, cap
- Ouput W = {wp,*wn} allocation need is exactly *wn
-
- Set W = U / V^k, where k is the largest integer <= cap such that the
- division yields an integer.
-
- FIXME: We currently allow any operand overlap. This is quite non mpn-ish
- and might be changed, since it cost significant temporary space.
- * If we require W to have space for un + 1 limbs, we could save qp or qp2
- (but we will still need to copy things into wp 50% of the time).
- * If we allow ourselves to clobber U, we could save the other of qp and qp2,
- and the initial COPY (but also here we would need un + 1 limbs).
-*/
-
-/* FIXME: We need to wrap mpn_bdiv_qr due to the itch interface. This need
- indicates a flaw in the current itch mechanism: Which operands not greater
- than un,un will incur the worst itch? We need a parallel foo_maxitch set
- of functions. */
-static void
-mpn_bdiv_qr_wrap (mp_ptr qp, mp_ptr rp,
- mp_srcptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn)
-{
- mp_ptr scratch_out;
- TMP_DECL;
-
- TMP_MARK;
- scratch_out = TMP_ALLOC_LIMBS (mpn_bdiv_qr_itch (nn, dn));
- mpn_bdiv_qr (qp, rp, np, nn, dp, dn, scratch_out);
-
- TMP_FREE;
-}
-
-mp_bitcnt_t
-mpn_remove (mp_ptr wp, mp_size_t *wn,
- mp_ptr up, mp_size_t un, mp_ptr vp, mp_size_t vn,
- mp_bitcnt_t cap)
-{
- mp_ptr pwpsp[LOG];
- mp_size_t pwpsn[LOG];
- mp_size_t npowers;
- mp_ptr tp, qp, np, pp, qp2;
- mp_size_t pn, nn, qn, i;
- mp_bitcnt_t pwr;
- TMP_DECL;
-
- ASSERT (un > 0);
- ASSERT (vn > 0);
- ASSERT (vp[0] % 2 != 0); /* 2-adic division wants odd numbers */
- ASSERT (vn > 1 || vp[0] > 1); /* else we would loop indefinitely */
-
- TMP_MARK;
-
- tp = TMP_ALLOC_LIMBS ((un + 1 + vn) / 2); /* remainder */
- qp = TMP_ALLOC_LIMBS (un + 1); /* quotient, alternating */
- qp2 = TMP_ALLOC_LIMBS (un + 1); /* quotient, alternating */
- pp = vp;
- pn = vn;
-
- MPN_COPY (qp, up, un);
- qn = un;
-
- npowers = 0;
- while (qn >= pn)
- {
- qp[qn] = 0;
- mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn);
- if (!mpn_zero_p (tp, pn))
- break; /* could not divide by V^npowers */
-
- MP_PTR_SWAP (qp, qp2);
- qn = qn - pn;
- qn += qp[qn] != 0;
-
- pwpsp[npowers] = pp;
- pwpsn[npowers] = pn;
- npowers++;
-
- if (((mp_bitcnt_t) 2 << npowers) - 1 > cap)
- break;
-
- nn = 2 * pn - 1; /* next power will be at least this large */
- if (nn > qn)
- break; /* next power would be overlarge */
-
- if (npowers == 1) /* Alloc once, but only if it's needed */
- np = TMP_ALLOC_LIMBS (qn + LOG); /* powers of V */
- else
- np += pn;
-
- mpn_sqr (np, pp, pn);
- pn = nn + (np[nn] != 0);
- pp = np;
- }
-
- pwr = ((mp_bitcnt_t) 1 << npowers) - 1;
-
- for (i = npowers - 1; i >= 0; i--)
- {
- pn = pwpsn[i];
- if (qn < pn)
- continue;
-
- if (pwr + ((mp_bitcnt_t) 1 << i) > cap)
- continue; /* V^i would bring us past cap */
-
- qp[qn] = 0;
- mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pwpsp[i], pn);
- if (!mpn_zero_p (tp, pn))
- continue; /* could not divide by V^i */
-
- MP_PTR_SWAP (qp, qp2);
- qn = qn - pn;
- qn += qp[qn] != 0;
-
- pwr += (mp_bitcnt_t) 1 << i;
- }
-
- MPN_COPY (wp, qp, qn);
- *wn = qn;
-
- TMP_FREE;
-
- return pwr;
-}
diff --git a/gmp/mpn/generic/rootrem.c b/gmp/mpn/generic/rootrem.c
index 2edc74baa3..657e543ab3 100644
--- a/gmp/mpn/generic/rootrem.c
+++ b/gmp/mpn/generic/rootrem.c
@@ -8,37 +8,29 @@
ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT'S ALMOST
GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2002, 2005, 2009-2012 Free Software Foundation, Inc.
+Copyright 2002, 2005, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* FIXME:
- This implementation is not optimal when remp == NULL, since the complexity
- is M(n), whereas it should be M(n/k) on average.
+ (a) Once there is a native mpn_tdiv_q function in GMP (division without
+ remainder), replace the quick-and-dirty implementation below by it.
+ (b) The implementation below is not optimal when remp == NULL, since the
+ complexity is M(n) where n is the input size, whereas it should be
+ only M(n/k) on average.
*/
#include <stdio.h> /* for NULL */
@@ -49,6 +41,8 @@ see https://www.gnu.org/licenses/. */
static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
mp_limb_t, int);
+static void mpn_tdiv_q (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t,
+ mp_srcptr, mp_size_t);
#define MPN_RSHIFT(cy,rp,up,un,cnt) \
do { \
@@ -90,15 +84,14 @@ mp_size_t
mpn_rootrem (mp_ptr rootp, mp_ptr remp,
mp_srcptr up, mp_size_t un, mp_limb_t k)
{
- mp_size_t m;
ASSERT (un > 0);
ASSERT (up[un - 1] != 0);
ASSERT (k > 1);
- m = (un - 1) / k; /* ceil(un/k) - 1 */
- if (remp == NULL && m > 2)
- /* Pad {up,un} with k zero limbs. This will produce an approximate root
- with one more limb, allowing us to compute the exact integral result. */
+ if ((remp == NULL) && (un / k > 2))
+ /* call mpn_rootrem_internal on {up,un} padded with k zero limbs,
+ which will produce an approximate root with one more limb,
+ so that in most cases we can conclude. */
{
mp_ptr sp, wp;
mp_size_t rn, sn, wn;
@@ -106,21 +99,21 @@ mpn_rootrem (mp_ptr rootp, mp_ptr remp,
TMP_MARK;
wn = un + k;
wp = TMP_ALLOC_LIMBS (wn); /* will contain the padded input */
- sn = m + 2; /* ceil(un/k) + 1 */
+ sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
sp = TMP_ALLOC_LIMBS (sn); /* approximate root of padded input */
MPN_COPY (wp + k, up, un);
MPN_ZERO (wp, k);
rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
- /* The approximate root S = {sp,sn} is either the correct root of
- {sp,sn}, or 1 too large. Thus unless the least significant limb of
- S is 0 or 1, we can deduce the root of {up,un} is S truncated by one
- limb. (In case sp[0]=1, we can deduce the root, but not decide
+ /* the approximate root S = {sp,sn} is either the correct root of
+ {wp,wn}, or one too large. Thus unless the least significant limb
+ of S is 0 or 1, we can deduce the root of {up,un} is S truncated by
+ one limb. (In case sp[0]=1, we can deduce the root, but not decide
whether it is exact or not.) */
MPN_COPY (rootp, sp + 1, sn - 1);
TMP_FREE;
return rn;
}
- else
+ else /* remp != NULL, or un / k <= 2 */
{
return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
}
@@ -131,11 +124,12 @@ static mp_size_t
mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
mp_limb_t k, int approx)
{
- mp_ptr qp, rp, sp, wp, scratch;
+ mp_ptr qp, rp, sp, wp;
mp_size_t qn, rn, sn, wn, nl, bn;
mp_limb_t save, save2, cy;
unsigned long int unb; /* number of significant bits of {up,un} */
unsigned long int xnb; /* number of significant bits of the result */
+ unsigned int cnt;
unsigned long b, kk;
unsigned long sizes[GMP_NUMB_BITS + 1];
int ni, i;
@@ -145,19 +139,25 @@ mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
TMP_MARK;
+ /* qp and wp need enough space to store S'^k where S' is an approximate
+ root. Since S' can be as large as S+2, the worst case is when S=2 and
+ S'=4. But then since we know the number of bits of S in advance, S'
+ can only be 3 at most. Similarly for S=4, then S' can be 6 at most.
+ So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k
+ fits in un limbs, the number of extra limbs needed is bounded by
+ ceil(k*log2(3/2)/GMP_NUMB_BITS). */
+#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
+ qp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain quotient and remainder
+ of R/(k*S^(k-1)), and S^k */
if (remp == NULL)
- {
- rp = TMP_ALLOC_LIMBS (un + 1); /* will contain the remainder */
- scratch = rp; /* used by mpn_div_q */
- }
+ rp = TMP_ALLOC_LIMBS (un); /* will contain the remainder */
else
- {
- scratch = TMP_ALLOC_LIMBS (un + 1); /* used by mpn_div_q */
- rp = remp;
- }
+ rp = remp;
sp = rootp;
-
- MPN_SIZEINBASE_2EXP(unb, up, un, 1);
+ wp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
+ and temporary for mpn_pow_1 */
+ count_leading_zeros (cnt, up[un - 1]);
+ unb = un * GMP_NUMB_BITS - cnt + GMP_NAIL_BITS;
/* unb is the number of bits of the input U */
xnb = (unb - 1) / k + 1; /* ceil (unb / k) */
@@ -216,19 +216,6 @@ mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
Newton iteration will first compute sizes[ni-1] extra bits,
then sizes[ni-2], ..., then sizes[0] = b. */
- /* qp and wp need enough space to store S'^k where S' is an approximate
- root. Since S' can be as large as S+2, the worst case is when S=2 and
- S'=4. But then since we know the number of bits of S in advance, S'
- can only be 3 at most. Similarly for S=4, then S' can be 6 at most.
- So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k
- fits in un limbs, the number of extra limbs needed is bounded by
- ceil(k*log2(3/2)/GMP_NUMB_BITS). */
-#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
- qp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain quotient and remainder
- of R/(k*S^(k-1)), and S^k */
- wp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
- and temporary for mpn_pow_1 */
-
wp[0] = 1; /* {sp,sn}^(k-1) = 1 */
wn = 1;
for (i = ni; i != 0; i--)
@@ -304,8 +291,13 @@ mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
}
else
{
+ mp_ptr tp;
qn = rn - wn; /* expected quotient size */
- mpn_div_q (qp, rp, rn, wp, wn, scratch);
+ /* tp must have space for wn limbs.
+ The quotient needs rn-wn+1 limbs, thus quotient+remainder
+ need altogether rn+1 limbs. */
+ tp = qp + qn + 1; /* put remainder in Q buffer */
+ mpn_tdiv_q (qp, tp, 0, rp, rn, wp, wn);
qn += qp[qn] != 0;
}
@@ -400,7 +392,7 @@ mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
ASSERT_ALWAYS (rn >= qn);
/* R = R - Q = floor(U/2^kk) - S^k */
- if (i > 1 || approx == 0)
+ if ((i > 1) || (approx == 0))
{
mpn_sub (rp, rp, rn, qp, qn);
MPN_NORMALIZE (rp, rn);
@@ -413,3 +405,47 @@ mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
TMP_FREE;
return rn;
}
+
+/* Set {qp,nn-dn+1} to the quotient of {np,nn} by {dp,dn}; {rp,dn} is scratch. */
+static void
+mpn_tdiv_q (mp_ptr qp, mp_ptr rp, mp_size_t qxn, mp_srcptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn)
+{
+ mp_size_t qn = nn - dn; /* expected quotient size is qn+1 */
+ mp_size_t cut; /* number of low divisor limbs dropped for the approximation */
+
+ ASSERT_ALWAYS (qxn == 0); /* every mpn_tdiv_qr call below passes 0 here */
+ if (dn <= qn + 3) /* divisor has no limbs to cut: divide directly */
+ {
+ mpn_tdiv_qr (qp, rp, 0, np, nn, dp, dn);
+ }
+ else
+ {
+ mp_ptr tp; /* approximate quotient T, one limb longer than Q */
+ TMP_DECL;
+ TMP_MARK;
+ tp = TMP_ALLOC_LIMBS (qn + 2);
+ cut = dn - (qn + 3); /* positive here, since dn > qn + 3 */
+ /* perform a first division with divisor cut to dn-cut=qn+3 limbs
+ and dividend to nn-(cut-1) limbs, i.e. the quotient will be one
+ limb more than the final quotient.
+ The quotient will have qn+2 < dn-cut limbs,
+ and the remainder dn-cut = qn+3 limbs. */
+ mpn_tdiv_qr (tp, rp, 0, np + cut - 1, nn - cut + 1, dp + cut, dn - cut);
+ /* let Q' be the quotient of B * {np, nn} by {dp, dn} [qn+2 limbs]
+ and T be the approximation of Q' computed above, where
+ B = 2^GMP_NUMB_BITS.
+ We have Q' <= T <= Q'+1, and since floor(Q'/B) = Q, we have
+ Q = floor(T/B), unless the last limb of T only consists of zeroes. */
+ if (tp[0] != 0)
+ {
+ /* simply truncate one limb of T; rp then holds the remainder of the
+ cut division, not of the full one */
+ MPN_COPY (qp, tp + 1, qn + 1);
+ }
+ else /* too bad: perform the expensive division */
+ {
+ mpn_tdiv_qr (qp, rp, 0, np, nn, dp, dn);
+ }
+ TMP_FREE;
+ }
+}
diff --git a/gmp/mpn/generic/rshift.c b/gmp/mpn/generic/rshift.c
index ec61f2f7e2..62256656de 100644
--- a/gmp/mpn/generic/rshift.c
+++ b/gmp/mpn/generic/rshift.c
@@ -1,32 +1,22 @@
/* mpn_rshift -- Shift right low level.
-Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1996, 2000, 2001, 2002 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/sb_bdiv_q.c b/gmp/mpn/generic/sb_bdiv_q.c
new file mode 100644
index 0000000000..474c804d48
--- /dev/null
+++ b/gmp/mpn/generic/sb_bdiv_q.c
@@ -0,0 +1,91 @@
+/* mpn_sb_bdiv_q -- schoolbook Hensel division with precomputed inverse,
+ returning quotient only.
+
+ Contributed to the GNU project by Niels Möller.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
+ IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005, 2006 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Computes Q = N / D mod B^nn, where N = {np,nn}, D = {dp,dn}, Q = {qp,nn}.
+ Clobbers N.
+
+ D must be odd. dinv is (-D)^-1 mod B.
+
+
+ The straightforward way to compute Q is to cancel one limb at a time, using
+
+ qp[i] = D^{-1} * np[i] (mod B)
+ N -= B^i * qp[i] * D
+
+ But we prefer addition to subtraction, since mpn_addmul_1 is often faster
+ than mpn_submul_1. Q = - N / D can be computed by iterating
+
+ qp[i] = (-D)^{-1} * np[i] (mod B)
+ N += B^i * qp[i] * D
+
+ And then we flip the sign, -Q = (not Q) + 1.
+*/
+
+void
+mpn_sb_bdiv_q (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn,
+ mp_limb_t dinv)
+{
+ mp_size_t i;
+ mp_limb_t qh; /* carry out of the final increment; assigned but never read */
+
+ ASSERT (nn > 0);
+ ASSERT (dn > 0);
+ ASSERT (nn >= dn);
+ ASSERT (dp[0] & 1); /* D must be odd */
+
+ for (i = 0; i < nn - dn; i++) /* limbs where all dn limbs of D are added */
+ {
+ mp_limb_t cy;
+ mp_limb_t q;
+
+ q = dinv * np[i];
+ qp[i] = ~q; /* store the complement; the final +1 completes the negation */
+ cy = mpn_addmul_1 (np + i, dp, dn, q);
+ mpn_add_1 (np + i + dn, np + i + dn, nn - i - dn, cy); /* propagate carry into the rest of N */
+ ASSERT (np[i] == 0);
+ }
+
+ for (; i < nn - 1; i++) /* tail: only the low nn-i limbs of D are added */
+ {
+ mp_limb_t q;
+
+ q = dinv * np[i];
+ qp[i] = ~q;
+ mpn_addmul_1 (np + i, dp, nn - i, q); /* carry out is discarded */
+
+ ASSERT (np[i] == 0);
+ }
+
+ /* Final limb: no update of N is performed after computing it */
+ qp[nn - 1] = ~(dinv * np[nn - 1]);
+ qh = mpn_add_1 (qp, qp, nn, 1); /* FIXME: can we get carry? */
+}
diff --git a/gmp/mpn/generic/sbpi1_bdiv_qr.c b/gmp/mpn/generic/sb_bdiv_qr.c
index 0e56f58148..d1cd0dee32 100644
--- a/gmp/mpn/generic/sbpi1_bdiv_qr.c
+++ b/gmp/mpn/generic/sb_bdiv_qr.c
@@ -1,39 +1,27 @@
-/* mpn_sbpi1_bdiv_qr -- schoolbook Hensel division with precomputed inverse,
+/* mpn_sb_bdiv_qr -- schoolbook Hensel division with precomputed inverse,
returning quotient and remainder.
- Contributed to the GNU project by Niels Möller.
+ Contributed to the GNU project by Niels Möller.
THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
- IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
- ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+ IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
-Copyright 2006, 2009, 2011, 2012 Free Software Foundation, Inc.
+Copyright 2006 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -52,20 +40,19 @@ see https://www.gnu.org/licenses/. */
D must be odd. dinv is (-D)^-1 mod B. */
mp_limb_t
-mpn_sbpi1_bdiv_qr (mp_ptr qp,
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+mpn_sb_bdiv_qr (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
{
mp_size_t qn;
mp_size_t i;
mp_limb_t rh;
mp_limb_t ql;
+ ASSERT (nn > 0);
ASSERT (dn > 0);
ASSERT (nn > dn);
- ASSERT ((dp[0] & 1) != 0);
- /* FIXME: Add ASSERTs for allowable overlapping; i.e., that qp = np is OK,
- but some over N/Q overlaps will not work. */
+ ASSERT (dp[0] & 1);
qn = nn - dn;
@@ -80,8 +67,9 @@ mpn_sbpi1_bdiv_qr (mp_ptr qp,
mp_limb_t q;
q = dinv * np[i];
- np[i] = mpn_addmul_1 (np + i, dp, dn, q);
qp[i] = ~q;
+
+ np[i] = mpn_addmul_1 (np + i, dp, dn, q);
}
rh += mpn_add (np + dn, np + dn, qn, np, dn);
ql = mpn_add_1 (qp, qp, dn, ql);
@@ -95,8 +83,9 @@ mpn_sbpi1_bdiv_qr (mp_ptr qp,
mp_limb_t q;
q = dinv * np[i];
- np[i] = mpn_addmul_1 (np + i, dp, dn, q);
qp[i] = ~q;
+
+ np[i] = mpn_addmul_1 (np + i, dp, dn, q);
}
rh += mpn_add_n (np + dn, np + dn, np, qn);
diff --git a/gmp/mpn/generic/sb_div_q.c b/gmp/mpn/generic/sb_div_q.c
new file mode 100644
index 0000000000..609c4ae7f2
--- /dev/null
+++ b/gmp/mpn/generic/sb_div_q.c
@@ -0,0 +1,240 @@
+/* mpn_sb_div_q -- schoolbook division with 2-limb sloppy non-greater
+ precomputed inverse, returning an accurate quotient.
+
+ Contributed to the GNU project by Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
+
+Copyright 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/*
+ CAVEATS:
+ 1. Should it demand normalized operands like now, or normalize on-the-fly?
+ 2. Overwrites {np,nn}.
+ 3. Uses mpn_submul_1. It would be nice to somehow make it use mpn_addmul_1
+ instead. (That would open for mpn_addmul_2 straightforwardly.)
+*/
+
+mp_limb_t
+mpn_sb_div_q (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn,
+ mp_srcptr dip)
+{
+ mp_limb_t q, q10, q01a, q00a, q01b, q00b;
+ mp_limb_t cy;
+ mp_size_t i;
+ mp_limb_t qh;
+ mp_limb_t di1, di0;
+ mp_size_t qn;
+ /* Divide {np,nn} by {dp,dn} using the 2-limb inverse at dip; writes qn = nn-dn quotient limbs at qp and clobbers {np,nn}. */
+ mp_size_t dn_orig = dn;  /* caller's operands, needed again by the compensation code at the end */
+ mp_srcptr dp_orig = dp;
+ mp_ptr np_orig = np;
+ /* Preconditions: divisor high bit set; Q and N must not overlap D; Q may overlap N only if qp+dn >= np. */
+ ASSERT (dn > 0);
+ ASSERT (nn >= dn);
+ ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+ ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
+ ASSERT_MPN (np, nn);
+ ASSERT_MPN (dp, dn);
+ /* Work from the most significant end: np now points just past {np,nn}. */
+ np += nn;
+ qn = nn - dn;  /* number of quotient limbs stored at qp */
+ if (qn + 1 < dn)
+ {
+ dp += dn - (qn + 1);  /* keep only the high qn+1 divisor limbs for the main loops */
+ dn = qn + 1;
+ }
+ /* qh = 1 when the high dn limbs of N are >= the (possibly truncated) D; D is then subtracted once. */
+ qh = mpn_cmp (np - dn, dp, dn) >= 0;
+ if (qh != 0)
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ /* Quotient limbs are written downwards, starting at qp[qn-1]. */
+ qp += qn;
+ di1 = dip[1]; di0 = dip[0];  /* high and low limb of the precomputed inverse */
+ for (i = qn; i >= dn; i--)  /* steps that use all dn divisor limbs */
+ {
+ np--;
+ umul_ppmm (q, q10, np[0], di1);
+ umul_ppmm (q01a, q00a, np[-1], di1);
+ add_ssaaaa (q, q10, q, q10, np[0], q01a);
+ umul_ppmm (q01b, q00b, np[0], di0);
+ add_ssaaaa (q, q10, q, q10, 0, q01b);
+ add_ssaaaa (q, q10, q, q10, 0, np[-1]);
+ /* q:q10 holds the accumulated estimate; only q is used as the quotient limb (q00a, q00b are never read). */
+ cy = mpn_submul_1 (np - dn, dp, dn, q);
+ /* submul_1 covers np[-dn..-1] only, so np[0] is compared against the borrow cy. */
+ if (UNLIKELY (np[0] > cy || mpn_cmp (np - dn, dp, dn) >= 0))
+ {
+ q = q + 1;  /* estimate was too small: bump q and subtract D once more */
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ }
+
+ *--qp = q;
+ }
+ /* Remaining dn-1 quotient limbs: D loses its low limb after every step. */
+ for (i = dn - 1; i > 0; i--)
+ {
+ np--;
+ umul_ppmm (q, q10, np[0], di1);
+ umul_ppmm (q01a, q00a, np[-1], di1);
+ add_ssaaaa (q, q10, q, q10, np[0], q01a);
+ umul_ppmm (q01b, q00b, np[0], di0);
+ add_ssaaaa (q, q10, q, q10, 0, q01b);
+ add_ssaaaa (q, q10, q, q10, 0, np[-1]);
+
+ cy = mpn_submul_1 (np - dn, dp, dn, q);
+
+ if (UNLIKELY (np[0] > cy || mpn_cmp (np - dn, dp, dn) >= 0))
+ {
+ q = q + 1;
+ if (q == 0)  /* q + 1 wrapped around: clamp to the largest limb and skip the subtraction */
+ q = GMP_NUMB_MAX;
+ else
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ }
+
+ *--qp = q;
+
+ /* Truncate operands. */
+ dn--;
+ dp++;
+
+ /* The partial remainder might be equal to the truncated divisor,
+ thus non-canonical. When that happens, the rest of the quotient
+ should be all ones. */
+ if (UNLIKELY (mpn_cmp (np - dn, dp, dn) == 0))
+ {
+ while (--i)  /* fills the i-1 quotient limbs that are still missing */
+ *--qp = GMP_NUMB_MAX;
+ break;
+ }
+ }
+ /* Restore the full divisor size; qp is back at the low end of the quotient here. */
+ dn = dn_orig;
+ if (UNLIKELY (np[-1] < dn))
+ {
+ mp_limb_t q, x;  /* this q shadows the function-level q */
+
+ /* The quotient may be too large if the remainder is small. Recompute
+ for above ignored operand parts, until the remainder spills.
+
+ FIXME: The quality of this code isn't the same as the code above.
+ 1. We don't compute things in an optimal order, high-to-low, in order
+ to terminate as quickly as possible.
+ 2. We mess with pointers and sizes, adding and subtracting and
+ adjusting to get things right. It surely could be streamlined.
+ 3. The only termination criteria are that we determine that the
+ quotient needs to be adjusted, or that we have recomputed
+ everything. We should stop when the remainder is so large
+ that no additional subtracting could make it spill.
+ 4. If nothing else, we should not do two loops of submul_1 over the
+ data, instead handle both the triangularization and chopping at
+ once. */
+
+ x = np[-1];  /* x counts how many further borrows can occur before the quotient must be decremented */
+
+ if (dn > 2)
+ {
+ /* Compensate for triangularization. */
+ mp_limb_t y;
+
+ dp = dp_orig;
+ if (qn + 1 < dn)
+ {
+ dp += dn - (qn + 1);
+ dn = qn + 1;
+ }
+
+ y = np[-2];  /* running copy of np[-2]; written back after the loop */
+
+ for (i = dn - 3; i >= 0; i--)
+ {
+ q = qp[i];
+ cy = mpn_submul_1 (np - (dn - i), dp, dn - i - 2, q);
+
+ if (y < cy)
+ {
+ if (x == 0)
+ {
+ cy = mpn_sub_1 (qp, qp, qn, 1);  /* decrement the qn-limb quotient by one */
+ ASSERT_ALWAYS (cy == 0);
+ return qh - cy;
+ }
+ x--;
+ }
+ y -= cy;
+ }
+ np[-2] = y;
+ }
+
+ dn = dn_orig;
+ if (qn + 1 < dn)
+ {
+ /* Compensate for ignored dividend and divisor tails. */
+
+ if (qn == 0)
+ return qh;
+
+ dp = dp_orig;
+ np = np_orig;
+
+ if (qh != 0)
+ {
+ cy = mpn_sub_n (np + qn, np + qn, dp, dn - (qn + 1));  /* the low dn-(qn+1) divisor limbs skipped when qh was formed */
+ if (cy != 0)
+ {
+ if (x == 0)
+ {
+ cy = mpn_sub_1 (qp, qp, qn, 1);  /* NOTE(review): unlike the two sibling paths, cy is not asserted to be 0 here */
+ return qh - cy;
+ }
+ x--;
+ }
+ }
+
+ for (i = dn - qn - 2; i >= 0; i--)
+ {
+ cy = mpn_submul_1 (np + i, qp, qn, dp[i]);  /* subtract Q * dp[i] at limb offset i */
+ cy = mpn_sub_1 (np + qn + i, np + qn + i, dn - qn - i - 1, cy);  /* propagate the borrow upwards */
+ if (cy != 0)
+ {
+ if (x == 0)
+ {
+ cy = mpn_sub_1 (qp, qp, qn, 1);
+ ASSERT_ALWAYS (cy == 0);
+ return qh - cy;
+ }
+ x--;
+ }
+ }
+ }
+ }
+
+ return qh;  /* 0 or 1, as set by the initial compare */
+}
diff --git a/gmp/mpn/generic/sb_div_qr.c b/gmp/mpn/generic/sb_div_qr.c
new file mode 100644
index 0000000000..40e4442e21
--- /dev/null
+++ b/gmp/mpn/generic/sb_div_qr.c
@@ -0,0 +1,91 @@
+/* mpn_sb_div_qr -- schoolbook division with 2-limb sloppy non-greater
+ precomputed inverse, returning quotient and remainder.
+
+ Contributed to the GNU project by Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
+
+Copyright 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/*
+ CAVEATS:
+ 1. Should it demand normalized operands like now, or normalize on-the-fly?
+ 2. Overwrites {np,nn} instead of writing remainder to a designated area.
+ 3. Uses mpn_submul_1. It would be nice to somehow make it use mpn_addmul_1
+ instead. (That would open for mpn_addmul_2 straightforwardly.)
+*/
+
+mp_limb_t
+mpn_sb_div_qr (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn,
+ mp_srcptr dip)
+{
+ mp_limb_t q, q10, q01a, q00a, q01b, q00b;
+ mp_limb_t cy;
+ mp_size_t i;
+ mp_limb_t qh;
+ mp_limb_t di1, di0;
+ /* Divide {np,nn} by {dp,dn} using the 2-limb inverse at dip; writes nn-dn quotient limbs at qp and overwrites {np,nn}. */
+ ASSERT (dn > 0);
+ ASSERT (nn >= dn);
+ ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+ ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
+ ASSERT_MPN (np, nn);
+ ASSERT_MPN (dp, dn);
+ /* Work from the most significant end: np now points just past {np,nn}. */
+ np += nn;
+ /* qh = 1 when the high dn limbs of N are >= D; D is then subtracted once. */
+ qh = mpn_cmp (np - dn, dp, dn) >= 0;
+ if (qh != 0)
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ /* Quotient limbs are written downwards, starting at qp[nn-dn-1]. */
+ qp += nn - dn;
+ di1 = dip[1]; di0 = dip[0];  /* high and low limb of the precomputed inverse */
+ for (i = nn - dn; i > 0; i--)  /* one quotient limb per iteration; the full divisor is used every time */
+ {
+ np--;
+ umul_ppmm (q, q10, np[0], di1);
+ umul_ppmm (q01a, q00a, np[-1], di1);
+ add_ssaaaa (q, q10, q, q10, np[0], q01a);
+ umul_ppmm (q01b, q00b, np[0], di0);
+ add_ssaaaa (q, q10, q, q10, 0, q01b);
+ add_ssaaaa (q, q10, q, q10, 0, np[-1]);
+ /* q:q10 holds the accumulated estimate; only q is used as the quotient limb (q00a, q00b are never read). */
+ cy = mpn_submul_1 (np - dn, dp, dn, q);
+ /* submul_1 covers np[-dn..-1] only, so np[0] is compared against the borrow cy. */
+ if (UNLIKELY (np[0] > cy || mpn_cmp (np - dn, dp, dn) >= 0))
+ {
+ q = q + 1;  /* estimate was too small: bump q and subtract D once more */
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ }
+
+ *--qp = q;
+ }
+ /* NOTE(review): presumably the remainder is what is left in the low dn limbs of the caller's {np,nn} -- confirm. */
+ return qh;  /* 0 or 1, as set by the initial compare */
+}
diff --git a/gmp/mpn/generic/sb_divappr_q.c b/gmp/mpn/generic/sb_divappr_q.c
new file mode 100644
index 0000000000..42a39be009
--- /dev/null
+++ b/gmp/mpn/generic/sb_divappr_q.c
@@ -0,0 +1,136 @@
+/* mpn_sb_divappr_q -- schoolbook division with 2-limb sloppy non-greater
+ precomputed inverse, returning approximate quotient.
+
+ Contributed to the GNU project by Torbjörn Granlund.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+ RELEASE.
+
+Copyright 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/*
+ CAVEATS:
+ 1. Should it demand normalized operands like now, or normalize on-the-fly?
+ 2. Overwrites {np,nn}.
+ 3. Uses mpn_submul_1. It would be nice to somehow make it use mpn_addmul_1
+ instead. (That would open for mpn_addmul_2 straightforwardly.)
+*/
+
+mp_limb_t
+mpn_sb_divappr_q (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn,
+ mp_srcptr dip)
+{
+ mp_limb_t q, q10, q01a, q00a, q01b, q00b;
+ mp_limb_t cy;
+ mp_size_t i;
+ mp_limb_t qh;
+ mp_limb_t di1, di0;
+ mp_size_t qn;
+ /* Approximate quotient of {np,nn} by {dp,dn} using the 2-limb inverse at dip; writes qn = nn-dn limbs at qp and clobbers {np,nn}. */
+ ASSERT (dn > 0);
+ ASSERT (nn >= dn);
+ ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+ ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
+ ASSERT_MPN (np, nn);
+ ASSERT_MPN (dp, dn);
+ /* Work from the most significant end: np now points just past {np,nn}. */
+ np += nn;
+ qn = nn - dn;  /* number of quotient limbs stored at qp */
+ if (qn + 1 < dn)
+ {
+ dp += dn - (qn + 1);  /* keep only the high qn+1 divisor limbs */
+ dn = qn + 1;
+ }
+ /* qh = 1 when the high dn limbs of N are >= the (possibly truncated) D; D is then subtracted once. */
+ qh = mpn_cmp (np - dn, dp, dn) >= 0;
+ if (qh != 0)
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ /* Quotient limbs are written downwards, starting at qp[qn-1]. */
+ qp += qn;
+ di1 = dip[1]; di0 = dip[0];  /* high and low limb of the precomputed inverse */
+ for (i = qn; i >= dn; i--)  /* steps that use all dn divisor limbs */
+ {
+ np--;
+ umul_ppmm (q, q10, np[0], di1);
+ umul_ppmm (q01a, q00a, np[-1], di1);
+ add_ssaaaa (q, q10, q, q10, np[0], q01a);
+ umul_ppmm (q01b, q00b, np[0], di0);
+ add_ssaaaa (q, q10, q, q10, 0, q01b);
+ add_ssaaaa (q, q10, q, q10, 0, np[-1]);
+ /* q:q10 holds the accumulated estimate; only q is used as the quotient limb (q00a, q00b are never read). */
+ cy = mpn_submul_1 (np - dn, dp, dn, q);
+ /* submul_1 covers np[-dn..-1] only, so np[0] is compared against the borrow cy. */
+ if (UNLIKELY (np[0] > cy || mpn_cmp (np - dn, dp, dn) >= 0))
+ {
+ q = q + 1;  /* estimate was too small: bump q and subtract D once more */
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ }
+
+ *--qp = q;
+ }
+ /* Remaining dn-1 quotient limbs: D loses its low limb after every step. */
+ for (i = dn - 1; i > 0; i--)
+ {
+ np--;
+ umul_ppmm (q, q10, np[0], di1);
+ umul_ppmm (q01a, q00a, np[-1], di1);
+ add_ssaaaa (q, q10, q, q10, np[0], q01a);
+ umul_ppmm (q01b, q00b, np[0], di0);
+ add_ssaaaa (q, q10, q, q10, 0, q01b);
+ add_ssaaaa (q, q10, q, q10, 0, np[-1]);
+
+ cy = mpn_submul_1 (np - dn, dp, dn, q);
+
+ if (UNLIKELY (np[0] > cy || mpn_cmp (np - dn, dp, dn) >= 0))
+ {
+ q = q + 1;
+ if (q == 0)  /* q + 1 wrapped around: clamp to the largest limb and skip the subtraction */
+ q = GMP_NUMB_MAX;
+ else
+ mpn_sub_n (np - dn, np - dn, dp, dn);
+ }
+
+ *--qp = q;
+
+ /* Truncate operands. */
+ dn--;
+ dp++;
+
+ /* The partial remainder might be equal to the truncated divisor,
+ thus non-canonical. When that happens, the rest of the quotient
+ should be all ones. */
+ if (UNLIKELY (mpn_cmp (np - dn, dp, dn) == 0))
+ {
+ while (--i)  /* fills the i-1 quotient limbs that are still missing */
+ *--qp = GMP_NUMB_MAX;
+ break;
+ }
+ }
+ /* No compensation for the truncated operand tails is done here; the quotient is left as computed. */
+ return qh;  /* 0 or 1, as set by the initial compare */
+}
diff --git a/gmp/mpn/generic/sb_divrem_mn.c b/gmp/mpn/generic/sb_divrem_mn.c
new file mode 100644
index 0000000000..06e2f4ca0d
--- /dev/null
+++ b/gmp/mpn/generic/sb_divrem_mn.c
@@ -0,0 +1,205 @@
+/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and
+ quotient.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright 1993, 1994, 1995, 1996, 2000, 2001, 2002 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd,
+ meaning the quotient size where that should happen, the quotient size
+ being how many udiv divisions will be done.
+
+ The default is to use preinv always, CPUs where this doesn't suit have
+ tuned thresholds. Note in particular that preinv should certainly be
+ used if that's the only division available (USE_PREINV_ALWAYS). */
+
+#ifndef DIV_SB_PREINV_THRESHOLD
+#define DIV_SB_PREINV_THRESHOLD 0
+#endif
+
+
+/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
+ the NSIZE-DSIZE least significant quotient limbs at QP
+ and the DSIZE long remainder at NP.
+ Return the most significant limb of the quotient, this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= DSIZE.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + DSIZE >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+ remainder in NUM.)
+ 3. NSIZE >= DSIZE.
+ 4. DSIZE > 2. */
+
+
+mp_limb_t
+mpn_sb_divrem_mn (mp_ptr qp,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn)
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t qn = nn - dn;
+ mp_size_t i;
+ mp_limb_t dx, d1, n0;
+ mp_limb_t dxinv;
+ int use_preinv;
+ /* Divide {np,nn} by {dp,dn}: stores qn = nn-dn quotient limbs at qp and returns the top quotient limb (0 or 1). */
+ ASSERT (dn > 2);
+ ASSERT (nn >= dn);
+ ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
+ ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
+ ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
+ ASSERT_MPN (np, nn);
+ ASSERT_MPN (dp, dn);
+ /* np now addresses the top dn limbs of N; dx and d1 are the two high limbs of D, n0 the high limb of N. */
+ np += qn;
+ dx = dp[dn - 1];
+ d1 = dp[dn - 2];
+ n0 = np[dn - 1];
+ /* If the top dn limbs of N are >= D, subtract D once and record a top quotient limb of 1. */
+ if (n0 >= dx)
+ {
+ if (n0 > dx || mpn_cmp (np, dp, dn - 1) >= 0)
+ {
+ mpn_sub_n (np, np, dp, dn);
+ most_significant_q_limb = 1;
+ }
+ }
+ /* Compute the inverse of dx only when qn is above DIV_SB_PREINV_THRESHOLD. */
+ use_preinv = ABOVE_THRESHOLD (qn, DIV_SB_PREINV_THRESHOLD);
+ if (use_preinv)
+ invert_limb (dxinv, dx);
+ /* One quotient limb per iteration, from qp[qn-1] down to qp[0]. */
+ for (i = qn - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t nx;
+ mp_limb_t cy_limb;
+
+ nx = np[dn - 1]; /* FIXME: could get value from r1 */
+ np--;
+
+ if (nx == dx)
+ {
+ /* This might over-estimate q, but it's probably not worth
+ the extra code here to find out. */
+ q = GMP_NUMB_MASK;
+
+#if 1
+ cy_limb = mpn_submul_1 (np, dp, dn, q);
+#else
+ /* This should be faster on many machines. NOTE(review): this branch is disabled and uses cy, which is not declared in this function. */
+ cy_limb = mpn_sub_n (np + 1, np + 1, dp, dn);
+ cy = mpn_add_n (np, np, dp, dn);
+ np[dn] += cy;
+#endif
+ /* A borrow different from nx means q was one too large: add D back and decrement q. */
+ if (nx != cy_limb)
+ {
+ mpn_add_n (np, np, dp, dn);
+ q--;
+ }
+
+ qp[i] = q;
+ }
+ else
+ {
+ mp_limb_t rx, r1, r0, p1, p0;
+
+ /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register usage
+ when np[dn-1] is used in an asm statement like umul_ppmm in
+ udiv_qrnnd_preinv. The symptom is seg faults due to registers
+ being clobbered. gcc 2.95 i386 doesn't have the problem. */
+ {
+ mp_limb_t workaround = np[dn - 1];
+ if (CACHED_ABOVE_THRESHOLD (use_preinv, DIV_SB_PREINV_THRESHOLD))
+ udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
+ else
+ {
+ udiv_qrnnd (q, r1, nx, workaround << GMP_NAIL_BITS,
+ dx << GMP_NAIL_BITS);
+ r1 >>= GMP_NAIL_BITS;
+ }
+ }
+ umul_ppmm (p1, p0, d1, q << GMP_NAIL_BITS);
+ p0 >>= GMP_NAIL_BITS;
+ /* p1:p0 = d1 * q is subtracted from rx:r1:r0; see the diagram at the end of the function. */
+ r0 = np[dn - 2];
+ rx = 0;
+ if (r1 < p1 || (r1 == p1 && r0 < p0))
+ {
+ p1 -= p0 < d1;
+ p0 = (p0 - d1) & GMP_NUMB_MASK;
+ q--;
+ r1 = (r1 + dx) & GMP_NUMB_MASK;
+ rx = r1 < dx;
+ }
+
+ p1 += r0 < p0; /* cannot carry! */
+ rx -= r1 < p1; /* may become 11..1 if q is still too large */
+ r1 = (r1 - p1) & GMP_NUMB_MASK;
+ r0 = (r0 - p0) & GMP_NUMB_MASK;
+ /* The two high divisor limbs were handled above; submul_1 covers the low dn-2 limbs. */
+ cy_limb = mpn_submul_1 (np, dp, dn - 2, q);
+
+ /* Check if we've over-estimated q, and adjust as needed. */
+ {
+ mp_limb_t cy1, cy2;
+ cy1 = r0 < cy_limb;
+ r0 = (r0 - cy_limb) & GMP_NUMB_MASK;
+ cy2 = r1 < cy1;
+ r1 -= cy1;
+ np[dn - 1] = r1;
+ np[dn - 2] = r0;
+ if (cy2 != rx)
+ {
+ mpn_add_n (np, np, dp, dn);
+ q--;
+ }
+ }
+ qp[i] = q;
+ }
+ }
+
+ /* ______ ______ ______
+ |__rx__|__r1__|__r0__| partial remainder
+ ______ ______
+ - |__p1__|__p0__| partial product to subtract
+ ______ ______
+ - |______|cylimb|
+
+ rx is -1, 0 or 1. If rx=1, then q is correct (it should match
+ carry out). If rx=-1 then q is too large. If rx=0, then q might
+ be too large, but it is most likely correct.
+ */
+
+ return most_significant_q_limb;
+}
diff --git a/gmp/mpn/generic/sbpi1_bdiv_q.c b/gmp/mpn/generic/sbpi1_bdiv_q.c
deleted file mode 100644
index 645b1d9b6a..0000000000
--- a/gmp/mpn/generic/sbpi1_bdiv_q.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/* mpn_sbpi1_bdiv_q -- schoolbook Hensel division with precomputed inverse,
- returning quotient only.
-
- Contributed to the GNU project by Niels Möller.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
- IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
- ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2005, 2006, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-/* Computes Q = N / D mod B^nn, destroys N.
-
- D must be odd. dinv is (-D)^-1 mod B.
-
-
- The straightforward way to compute Q is to cancel one limb at a time, using
-
- qp[i] = D^{-1} * np[i] (mod B)
- N -= B^i * qp[i] * D
-
- But we prefer addition to subtraction, since mpn_addmul_1 is often faster
- than mpn_submul_1. Q = - N / D can be computed by iterating
-
- qp[i] = (-D)^{-1} * np[i] (mod B)
- N += B^i * qp[i] * D
-
- And then we flip the sign, -Q = (not Q) + 1. */
-
-void
-mpn_sbpi1_bdiv_q (mp_ptr qp,
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_limb_t dinv)
-{
- mp_size_t i;
- mp_limb_t cy, q;
-
- ASSERT (dn > 0);
- ASSERT (nn >= dn);
- ASSERT ((dp[0] & 1) != 0);
- /* FIXME: Add ASSERTs for allowable overlapping; i.e., that qp = np is OK,
- but some over N/Q overlaps will not work. */
-
- for (i = nn - dn; i > 0; i--)
- {
- q = dinv * np[0];
- cy = mpn_addmul_1 (np, dp, dn, q);
- mpn_add_1 (np + dn, np + dn, i, cy);
- ASSERT (np[0] == 0);
- qp[0] = ~q;
- qp++;
- np++;
- }
-
- for (i = dn; i > 1; i--)
- {
- q = dinv * np[0];
- mpn_addmul_1 (np, dp, i, q);
- ASSERT (np[0] == 0);
- qp[0] = ~q;
- qp++;
- np++;
- }
-
- /* Final limb */
- q = dinv * np[0];
- qp[0] = ~q;
- mpn_add_1 (qp - nn + 1, qp - nn + 1, nn, 1);
-}
diff --git a/gmp/mpn/generic/sbpi1_div_q.c b/gmp/mpn/generic/sbpi1_div_q.c
deleted file mode 100644
index 3abbd57933..0000000000
--- a/gmp/mpn/generic/sbpi1_div_q.c
+++ /dev/null
@@ -1,303 +0,0 @@
-/* mpn_sbpi1_div_q -- Schoolbook division using the Möller-Granlund 3/2
- division algorithm.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2007, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-mp_limb_t
-mpn_sbpi1_div_q (mp_ptr qp,
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_limb_t dinv)
-{
- mp_limb_t qh;
- mp_size_t qn, i;
- mp_limb_t n1, n0;
- mp_limb_t d1, d0;
- mp_limb_t cy, cy1;
- mp_limb_t q;
- mp_limb_t flag;
-
- mp_size_t dn_orig = dn;
- mp_srcptr dp_orig = dp;
- mp_ptr np_orig = np;
-
- ASSERT (dn > 2);
- ASSERT (nn >= dn);
- ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
-
- np += nn;
-
- qn = nn - dn;
- if (qn + 1 < dn)
- {
- dp += dn - (qn + 1);
- dn = qn + 1;
- }
-
- qh = mpn_cmp (np - dn, dp, dn) >= 0;
- if (qh != 0)
- mpn_sub_n (np - dn, np - dn, dp, dn);
-
- qp += qn;
-
- dn -= 2; /* offset dn by 2 for main division loops,
- saving two iterations in mpn_submul_1. */
- d1 = dp[dn + 1];
- d0 = dp[dn + 0];
-
- np -= 2;
-
- n1 = np[1];
-
- for (i = qn - (dn + 2); i >= 0; i--)
- {
- np--;
- if (UNLIKELY (n1 == d1) && np[1] == d0)
- {
- q = GMP_NUMB_MASK;
- mpn_submul_1 (np - dn, dp, dn + 2, q);
- n1 = np[1]; /* update n1, last loop's value will now be invalid */
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
-
- cy = mpn_submul_1 (np - dn, dp, dn, q);
-
- cy1 = n0 < cy;
- n0 = (n0 - cy) & GMP_NUMB_MASK;
- cy = n1 < cy1;
- n1 -= cy1;
- np[0] = n0;
-
- if (UNLIKELY (cy != 0))
- {
- n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
- q--;
- }
- }
-
- *--qp = q;
- }
-
- flag = ~CNST_LIMB(0);
-
- if (dn >= 0)
- {
- for (i = dn; i > 0; i--)
- {
- np--;
- if (UNLIKELY (n1 >= (d1 & flag)))
- {
- q = GMP_NUMB_MASK;
- cy = mpn_submul_1 (np - dn, dp, dn + 2, q);
-
- if (UNLIKELY (n1 != cy))
- {
- if (n1 < (cy & flag))
- {
- q--;
- mpn_add_n (np - dn, np - dn, dp, dn + 2);
- }
- else
- flag = 0;
- }
- n1 = np[1];
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
-
- cy = mpn_submul_1 (np - dn, dp, dn, q);
-
- cy1 = n0 < cy;
- n0 = (n0 - cy) & GMP_NUMB_MASK;
- cy = n1 < cy1;
- n1 -= cy1;
- np[0] = n0;
-
- if (UNLIKELY (cy != 0))
- {
- n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
- q--;
- }
- }
-
- *--qp = q;
-
- /* Truncate operands. */
- dn--;
- dp++;
- }
-
- np--;
- if (UNLIKELY (n1 >= (d1 & flag)))
- {
- q = GMP_NUMB_MASK;
- cy = mpn_submul_1 (np, dp, 2, q);
-
- if (UNLIKELY (n1 != cy))
- {
- if (n1 < (cy & flag))
- {
- q--;
- add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]);
- }
- else
- flag = 0;
- }
- n1 = np[1];
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
-
- np[0] = n0;
- np[1] = n1;
- }
-
- *--qp = q;
- }
- ASSERT_ALWAYS (np[1] == n1);
- np += 2;
-
-
- dn = dn_orig;
- if (UNLIKELY (n1 < (dn & flag)))
- {
- mp_limb_t q, x;
-
- /* The quotient may be too large if the remainder is small. Recompute
- for above ignored operand parts, until the remainder spills.
-
- FIXME: The quality of this code isn't the same as the code above.
- 1. We don't compute things in an optimal order, high-to-low, in order
- to terminate as quickly as possible.
- 2. We mess with pointers and sizes, adding and subtracting and
- adjusting to get things right. It surely could be streamlined.
- 3. The only termination criteria are that we determine that the
- quotient needs to be adjusted, or that we have recomputed
- everything. We should stop when the remainder is so large
- that no additional subtracting could make it spill.
- 4. If nothing else, we should not do two loops of submul_1 over the
- data, instead handle both the triangularization and chopping at
- once. */
-
- x = n1;
-
- if (dn > 2)
- {
- /* Compensate for triangularization. */
- mp_limb_t y;
-
- dp = dp_orig;
- if (qn + 1 < dn)
- {
- dp += dn - (qn + 1);
- dn = qn + 1;
- }
-
- y = np[-2];
-
- for (i = dn - 3; i >= 0; i--)
- {
- q = qp[i];
- cy = mpn_submul_1 (np - (dn - i), dp, dn - i - 2, q);
-
- if (y < cy)
- {
- if (x == 0)
- {
- cy = mpn_sub_1 (qp, qp, qn, 1);
- ASSERT_ALWAYS (cy == 0);
- return qh - cy;
- }
- x--;
- }
- y -= cy;
- }
- np[-2] = y;
- }
-
- dn = dn_orig;
- if (qn + 1 < dn)
- {
- /* Compensate for ignored dividend and divisor tails. */
-
- dp = dp_orig;
- np = np_orig;
-
- if (qh != 0)
- {
- cy = mpn_sub_n (np + qn, np + qn, dp, dn - (qn + 1));
- if (cy != 0)
- {
- if (x == 0)
- {
- if (qn != 0)
- cy = mpn_sub_1 (qp, qp, qn, 1);
- return qh - cy;
- }
- x--;
- }
- }
-
- if (qn == 0)
- return qh;
-
- for (i = dn - qn - 2; i >= 0; i--)
- {
- cy = mpn_submul_1 (np + i, qp, qn, dp[i]);
- cy = mpn_sub_1 (np + qn + i, np + qn + i, dn - qn - i - 1, cy);
- if (cy != 0)
- {
- if (x == 0)
- {
- cy = mpn_sub_1 (qp, qp, qn, 1);
- return qh;
- }
- x--;
- }
- }
- }
- }
-
- return qh;
-}
diff --git a/gmp/mpn/generic/sbpi1_div_qr.c b/gmp/mpn/generic/sbpi1_div_qr.c
deleted file mode 100644
index 0c3e4cb729..0000000000
--- a/gmp/mpn/generic/sbpi1_div_qr.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/* mpn_sbpi1_div_qr -- Schoolbook division using the Möller-Granlund 3/2
- division algorithm.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2007, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-mp_limb_t
-mpn_sbpi1_div_qr (mp_ptr qp,
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_limb_t dinv)
-{
- mp_limb_t qh;
- mp_size_t i;
- mp_limb_t n1, n0;
- mp_limb_t d1, d0;
- mp_limb_t cy, cy1;
- mp_limb_t q;
-
- ASSERT (dn > 2);
- ASSERT (nn >= dn);
- ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
-
- np += nn;
-
- qh = mpn_cmp (np - dn, dp, dn) >= 0;
- if (qh != 0)
- mpn_sub_n (np - dn, np - dn, dp, dn);
-
- qp += nn - dn;
-
- dn -= 2; /* offset dn by 2 for main division loops,
- saving two iterations in mpn_submul_1. */
- d1 = dp[dn + 1];
- d0 = dp[dn + 0];
-
- np -= 2;
-
- n1 = np[1];
-
- for (i = nn - (dn + 2); i > 0; i--)
- {
- np--;
- if (UNLIKELY (n1 == d1) && np[1] == d0)
- {
- q = GMP_NUMB_MASK;
- mpn_submul_1 (np - dn, dp, dn + 2, q);
- n1 = np[1]; /* update n1, last loop's value will now be invalid */
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
-
- cy = mpn_submul_1 (np - dn, dp, dn, q);
-
- cy1 = n0 < cy;
- n0 = (n0 - cy) & GMP_NUMB_MASK;
- cy = n1 < cy1;
- n1 = (n1 - cy1) & GMP_NUMB_MASK;
- np[0] = n0;
-
- if (UNLIKELY (cy != 0))
- {
- n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
- q--;
- }
- }
-
- *--qp = q;
- }
- np[1] = n1;
-
- return qh;
-}
diff --git a/gmp/mpn/generic/sbpi1_divappr_q.c b/gmp/mpn/generic/sbpi1_divappr_q.c
deleted file mode 100644
index 3e7cf91ba6..0000000000
--- a/gmp/mpn/generic/sbpi1_divappr_q.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/* mpn_sbpi1_divappr_q -- Schoolbook division using the Möller-Granlund 3/2
- division algorithm, returning approximate quotient. The quotient returned
- is either correct, or one too large.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
-
-Copyright 2007, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-mp_limb_t
-mpn_sbpi1_divappr_q (mp_ptr qp,
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_limb_t dinv)
-{
- mp_limb_t qh;
- mp_size_t qn, i;
- mp_limb_t n1, n0;
- mp_limb_t d1, d0;
- mp_limb_t cy, cy1;
- mp_limb_t q;
- mp_limb_t flag;
-
- ASSERT (dn > 2);
- ASSERT (nn >= dn);
- ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
-
- np += nn;
-
- qn = nn - dn;
- if (qn + 1 < dn)
- {
- dp += dn - (qn + 1);
- dn = qn + 1;
- }
-
- qh = mpn_cmp (np - dn, dp, dn) >= 0;
- if (qh != 0)
- mpn_sub_n (np - dn, np - dn, dp, dn);
-
- qp += qn;
-
- dn -= 2; /* offset dn by 2 for main division loops,
- saving two iterations in mpn_submul_1. */
- d1 = dp[dn + 1];
- d0 = dp[dn + 0];
-
- np -= 2;
-
- n1 = np[1];
-
- for (i = qn - (dn + 2); i >= 0; i--)
- {
- np--;
- if (UNLIKELY (n1 == d1) && np[1] == d0)
- {
- q = GMP_NUMB_MASK;
- mpn_submul_1 (np - dn, dp, dn + 2, q);
- n1 = np[1]; /* update n1, last loop's value will now be invalid */
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
-
- cy = mpn_submul_1 (np - dn, dp, dn, q);
-
- cy1 = n0 < cy;
- n0 = (n0 - cy) & GMP_NUMB_MASK;
- cy = n1 < cy1;
- n1 -= cy1;
- np[0] = n0;
-
- if (UNLIKELY (cy != 0))
- {
- n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
- q--;
- }
- }
-
- *--qp = q;
- }
-
- flag = ~CNST_LIMB(0);
-
- if (dn >= 0)
- {
- for (i = dn; i > 0; i--)
- {
- np--;
- if (UNLIKELY (n1 >= (d1 & flag)))
- {
- q = GMP_NUMB_MASK;
- cy = mpn_submul_1 (np - dn, dp, dn + 2, q);
-
- if (UNLIKELY (n1 != cy))
- {
- if (n1 < (cy & flag))
- {
- q--;
- mpn_add_n (np - dn, np - dn, dp, dn + 2);
- }
- else
- flag = 0;
- }
- n1 = np[1];
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
-
- cy = mpn_submul_1 (np - dn, dp, dn, q);
-
- cy1 = n0 < cy;
- n0 = (n0 - cy) & GMP_NUMB_MASK;
- cy = n1 < cy1;
- n1 -= cy1;
- np[0] = n0;
-
- if (UNLIKELY (cy != 0))
- {
- n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
- q--;
- }
- }
-
- *--qp = q;
-
- /* Truncate operands. */
- dn--;
- dp++;
- }
-
- np--;
- if (UNLIKELY (n1 >= (d1 & flag)))
- {
- q = GMP_NUMB_MASK;
- cy = mpn_submul_1 (np, dp, 2, q);
-
- if (UNLIKELY (n1 != cy))
- {
- if (n1 < (cy & flag))
- {
- q--;
- add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]);
- }
- else
- flag = 0;
- }
- n1 = np[1];
- }
- else
- {
- udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
-
- np[1] = n1;
- np[0] = n0;
- }
-
- *--qp = q;
- }
-
- ASSERT_ALWAYS (np[1] == n1);
-
- return qh;
-}
diff --git a/gmp/mpn/generic/scan0.c b/gmp/mpn/generic/scan0.c
index 8171fd5afe..2e9f3a43da 100644
--- a/gmp/mpn/generic/scan0.c
+++ b/gmp/mpn/generic/scan0.c
@@ -5,28 +5,17 @@ Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -36,8 +25,9 @@ see https://www.gnu.org/licenses/. */
1. U must sooner or later have a limb with a clear bit.
*/
-mp_bitcnt_t
-mpn_scan0 (mp_srcptr up, mp_bitcnt_t starting_bit)
+unsigned long int
+mpn_scan0 (register mp_srcptr up,
+ register unsigned long int starting_bit)
{
mp_size_t starting_word;
mp_limb_t alimb;
diff --git a/gmp/mpn/generic/scan1.c b/gmp/mpn/generic/scan1.c
index e22ad5d827..d0d9a3feea 100644
--- a/gmp/mpn/generic/scan1.c
+++ b/gmp/mpn/generic/scan1.c
@@ -5,28 +5,17 @@ Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -36,8 +25,9 @@ see https://www.gnu.org/licenses/. */
1. U must sooner or later have a limb != 0.
*/
-mp_bitcnt_t
-mpn_scan1 (mp_srcptr up, mp_bitcnt_t starting_bit)
+unsigned long int
+mpn_scan1 (register mp_srcptr up,
+ register unsigned long int starting_bit)
{
mp_size_t starting_word;
mp_limb_t alimb;
diff --git a/gmp/mpn/generic/sec_aors_1.c b/gmp/mpn/generic/sec_aors_1.c
deleted file mode 100644
index d789a5792e..0000000000
--- a/gmp/mpn/generic/sec_aors_1.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* mpn_sec_add_1, mpn_sec_sub_1
-
- Contributed to the GNU project by Niels Möller
-
-Copyright 2013, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#if OPERATION_sec_add_1
-#define FNAME mpn_sec_add_1
-#define FNAME_itch mpn_sec_add_1_itch
-#define OP_N mpn_add_n
-#endif
-#if OPERATION_sec_sub_1
-#define FNAME mpn_sec_sub_1
-#define FNAME_itch mpn_sec_sub_1_itch
-#define OP_N mpn_sub_n
-#endif
-
-/* It's annoying to that we need scratch space */
-mp_size_t
-FNAME_itch (mp_size_t n)
-{
- return n;
-}
-
-mp_limb_t
-FNAME (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_ptr scratch)
-{
- scratch[0] = b;
- MPN_ZERO (scratch + 1, n-1);
- return OP_N (rp, ap, scratch, n);
-}
diff --git a/gmp/mpn/generic/sec_div.c b/gmp/mpn/generic/sec_div.c
deleted file mode 100644
index 483b118d0d..0000000000
--- a/gmp/mpn/generic/sec_div.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/* mpn_sec_div_qr, mpn_sec_div_r -- Compute Q = floor(U / V), U = U mod V.
- Side-channel silent under the assumption that the used instructions are
- side-channel silent.
-
- Contributed to the GNU project by Torbjörn Granlund.
-
-Copyright 2011-2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#if OPERATION_sec_div_qr
-#define FNAME mpn_sec_div_qr
-#define FNAME_itch mpn_sec_div_qr_itch
-#define Q(q) q,
-#define RETTYPE mp_limb_t
-#endif
-#if OPERATION_sec_div_r
-#define FNAME mpn_sec_div_r
-#define FNAME_itch mpn_sec_div_r_itch
-#define Q(q)
-#define RETTYPE void
-#endif
-
-mp_size_t
-FNAME_itch (mp_size_t nn, mp_size_t dn)
-{
-#if OPERATION_sec_div_qr
-/* Needs (nn + dn + 1) + mpn_sec_pi1_div_qr's needs of (2nn' - dn + 1) for a
- total of 3nn + 4 limbs at tp. Note that mpn_sec_pi1_div_qr's nn is one
- greater than ours, therefore +4 and not just +2. */
- return 3 * nn + 4;
-#endif
-#if OPERATION_sec_div_r
-/* Needs (nn + dn + 1) + mpn_sec_pi1_div_r's needs of (dn + 1) for a total of
- nn + 2dn + 2 limbs at tp. */
- return nn + 2 * dn + 2;
-#endif
-}
-
-RETTYPE
-FNAME (Q(mp_ptr qp)
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_ptr tp)
-{
- mp_limb_t d1, d0;
- unsigned int cnt;
- gmp_pi1_t dinv;
- mp_limb_t inv32;
-
- ASSERT (dn >= 1);
- ASSERT (nn >= dn);
- ASSERT (dp[dn - 1] != 0);
-
- d1 = dp[dn - 1];
- count_leading_zeros (cnt, d1);
-
- if (cnt != 0)
- {
- mp_limb_t qh, cy;
- mp_ptr np2, dp2;
- dp2 = tp; /* dn limbs */
- mpn_lshift (dp2, dp, dn, cnt);
-
- np2 = tp + dn; /* (nn + 1) limbs */
- cy = mpn_lshift (np2, np, nn, cnt);
- np2[nn++] = cy;
-
- d0 = dp2[dn - 1];
- d0 += (~d0 != 0);
- invert_limb (inv32, d0);
-
- /* We add nn + dn to tp here, not nn + 1 + dn, as expected. This is
- since nn here will have been incremented. */
-#if OPERATION_sec_div_qr
- qh = mpn_sec_pi1_div_qr (np2 + dn, np2, nn, dp2, dn, inv32, tp + nn + dn);
- ASSERT (qh == 0); /* FIXME: this indicates inefficiency! */
- MPN_COPY (qp, np2 + dn, nn - dn - 1);
- qh = np2[nn - 1];
-#else
- mpn_sec_pi1_div_r (np2, nn, dp2, dn, inv32, tp + nn + dn);
-#endif
-
- mpn_rshift (np, np2, dn, cnt);
-
-#if OPERATION_sec_div_qr
- return qh;
-#endif
- }
- else
- {
- /* FIXME: Consider copying np => np2 here, adding a 0-limb at the top.
- That would simplify the underlying pi1 function, since then it could
- assume nn > dn. */
- d0 = dp[dn - 1];
- d0 += (~d0 != 0);
- invert_limb (inv32, d0);
-
-#if OPERATION_sec_div_qr
- return mpn_sec_pi1_div_qr (qp, np, nn, dp, dn, inv32, tp);
-#else
- mpn_sec_pi1_div_r (np, nn, dp, dn, inv32, tp);
-#endif
- }
-}
diff --git a/gmp/mpn/generic/sec_invert.c b/gmp/mpn/generic/sec_invert.c
deleted file mode 100644
index 43a578b2a1..0000000000
--- a/gmp/mpn/generic/sec_invert.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/* mpn_sec_invert
-
- Contributed to the GNU project by Niels Möller
-
-Copyright 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#if 0
-/* Currently unused. Should be resurrected once mpn_cnd_neg is
- advertised. */
-static mp_size_t
-mpn_cnd_neg_itch (mp_size_t n)
-{
- return n;
-}
-#endif
-
-/* FIXME: Ought to return carry */
-static void
-mpn_cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n,
- mp_ptr scratch)
-{
- mpn_lshift (scratch, ap, n, 1);
- mpn_cnd_sub_n (cnd, rp, ap, scratch, n);
-}
-
-static void
-mpn_cnd_swap (int cnd, volatile mp_limb_t *ap, volatile mp_limb_t *bp,
- mp_size_t n)
-{
- volatile mp_limb_t mask = - (mp_limb_t) (cnd != 0);
- mp_size_t i;
- for (i = 0; i < n; i++)
- {
- mp_limb_t a, b, t;
- a = ap[i];
- b = bp[i];
- t = (a ^ b) & mask;
- ap[i] = a ^ t;
- bp[i] = b ^ t;
- }
-}
-
-static int
-mpn_sec_eq_ui (mp_srcptr ap, mp_size_t n, mp_limb_t b)
-{
- mp_limb_t d;
- ASSERT (n > 0);
-
- d = ap[0] ^ b;
-
- while (--n > 0)
- d |= ap[n];
-
- return d == 0;
-}
-
-mp_size_t
-mpn_sec_invert_itch (mp_size_t n)
-{
- return 4*n;
-}
-
-/* Compute V <-- A^{-1} (mod M), in data-independent time. M must be
- odd. Returns 1 on success, and 0 on failure (i.e., if gcd (A, m) !=
- 1). Inputs and outputs of size n, and no overlap allowed. The {ap,
- n} area is destroyed. For arbitrary inputs, bit_size should be
- 2*n*GMP_NUMB_BITS, but if A or M are known to be smaller, e.g., if
- M = 2^521 - 1 and A < M, bit_size can be any bound on the sum of
- the bit sizes of A and M. */
-int
-mpn_sec_invert (mp_ptr vp, mp_ptr ap, mp_srcptr mp,
- mp_size_t n, mp_bitcnt_t bit_size,
- mp_ptr scratch)
-{
- ASSERT (n > 0);
- ASSERT (bit_size > 0);
- ASSERT (mp[0] & 1);
- ASSERT (! MPN_OVERLAP_P (ap, n, vp, n));
-#define bp (scratch + n)
-#define up (scratch + 2*n)
-#define m1hp (scratch + 3*n)
-
- /* Maintain
-
- a = u * orig_a (mod m)
- b = v * orig_a (mod m)
-
- and b odd at all times. Initially,
-
- a = a_orig, u = 1
- b = m, v = 0
- */
-
-
- up[0] = 1;
- mpn_zero (up+1, n - 1);
- mpn_copyi (bp, mp, n);
- mpn_zero (vp, n);
-
- ASSERT_CARRY (mpn_rshift (m1hp, mp, n, 1));
- ASSERT_NOCARRY (mpn_sec_add_1 (m1hp, m1hp, n, 1, scratch));
-
- while (bit_size-- > 0)
- {
- mp_limb_t odd, swap, cy;
-
- /* Always maintain b odd. The logic of the iteration is as
- follows. For a, b:
-
- odd = a & 1
- a -= odd * b
- if (underflow from a-b)
- {
- b += a, assigns old a
- a = B^n-a
- }
-
- a /= 2
-
- For u, v:
-
- if (underflow from a - b)
- swap u, v
- u -= odd * v
- if (underflow from u - v)
- u += m
-
- u /= 2
- if (a one bit was shifted out)
- u += (m+1)/2
-
- As long as a > 0, the quantity
-
- (bitsize of a) + (bitsize of b)
-
- is reduced by at least one bit per iteration, hence after (bit_size of
- orig_a) + (bit_size of m) - 1 iterations we surely have a = 0. Then b
- = gcd(orig_a, m) and if b = 1 then also v = orig_a^{-1} (mod m).
- */
-
- ASSERT (bp[0] & 1);
- odd = ap[0] & 1;
-
- swap = mpn_cnd_sub_n (odd, ap, ap, bp, n);
- mpn_cnd_add_n (swap, bp, bp, ap, n);
- mpn_cnd_neg (swap, ap, ap, n, scratch);
-
- mpn_cnd_swap (swap, up, vp, n);
- cy = mpn_cnd_sub_n (odd, up, up, vp, n);
- cy -= mpn_cnd_add_n (cy, up, up, mp, n);
- ASSERT (cy == 0);
-
- cy = mpn_rshift (ap, ap, n, 1);
- ASSERT (cy == 0);
- cy = mpn_rshift (up, up, n, 1);
- cy = mpn_cnd_add_n (cy, up, up, m1hp, n);
- ASSERT (cy == 0);
- }
- /* Should be all zeros, but check only extreme limbs */
- ASSERT ( (ap[0] | ap[n-1]) == 0);
- /* Check if indeed gcd == 1. */
- return mpn_sec_eq_ui (bp, n, 1);
-#undef bp
-#undef up
-#undef m1hp
-}
diff --git a/gmp/mpn/generic/sec_mul.c b/gmp/mpn/generic/sec_mul.c
deleted file mode 100644
index 2cd87fab1d..0000000000
--- a/gmp/mpn/generic/sec_mul.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/* mpn_sec_mul.
-
- Contributed to the GNU project by Torbjörn Granlund.
-
-Copyright 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-void
-mpn_sec_mul (mp_ptr rp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn,
- mp_ptr tp)
-{
- mpn_mul_basecase (rp, ap, an, bp, bn);
-}
-
-mp_size_t
-mpn_sec_mul_itch (mp_size_t an, mp_size_t bn)
-{
- return 0;
-}
diff --git a/gmp/mpn/generic/sec_pi1_div.c b/gmp/mpn/generic/sec_pi1_div.c
deleted file mode 100644
index 1e075daf73..0000000000
--- a/gmp/mpn/generic/sec_pi1_div.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/* mpn_sec_pi1_div_qr, mpn_sec_pi1_div_r -- Compute Q = floor(U / V), U = U
- mod V. Side-channel silent under the assumption that the used instructions
- are side-channel silent.
-
- Contributed to the GNU project by Torbjörn Granlund.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011-2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* This side-channel silent division algorithm reduces the partial remainder by
- GMP_NUMB_BITS/2 bits at a time, compared to GMP_NUMB_BITS for the main
- division algorithm. We actually do not insist on reducing by exactly
- GMP_NUMB_BITS/2, but may leave a partial remainder that is D*B^i to 3D*B^i
- too large (B is the limb base, D is the divisor, and i is the induction
- variable); the subsequent step will handle the extra partial remainder bits.
-
- With that partial remainder reduction, each step generates a quotient "half
- limb". The outer loop generates two quotient half limbs, an upper (q1h) and
- a lower (q0h) which are stored sparsely in separate limb arrays. These
- arrays are added at the end; using separate arrays avoids data-dependent
- carry propagation which could else pose a side-channel leakage problem.
-
- The quotient half limbs may be between -3 to 0 from the accurate value
- ("accurate" being the one which corresponds to a reduction to a principal
- partial remainder). Too small quotient half limbs correspond to too large
- remainders, which we reduce later, as described above.
-
- In order to keep quotients from getting too big, corresponding to a negative
- partial remainder, we use an inverse which is slightly smaller than usually.
-*/
-
-#if OPERATION_sec_pi1_div_qr
-/* Needs (dn + 1) + (nn - dn) + (nn - dn) = 2nn - dn + 1 limbs at tp. */
-#define FNAME mpn_sec_pi1_div_qr
-#define Q(q) q,
-#define RETTYPE mp_limb_t
-#endif
-#if OPERATION_sec_pi1_div_r
-/* Needs (dn + 1) limbs at tp. */
-#define FNAME mpn_sec_pi1_div_r
-#define Q(q)
-#define RETTYPE void
-#endif
-
-RETTYPE
-FNAME (Q(mp_ptr qp)
- mp_ptr np, mp_size_t nn,
- mp_srcptr dp, mp_size_t dn,
- mp_limb_t dinv,
- mp_ptr tp)
-{
- mp_limb_t nh, cy, q1h, q0h, dummy, cnd;
- mp_size_t i;
- mp_ptr hp;
-#if OPERATION_sec_pi1_div_qr
- mp_limb_t qh;
- mp_ptr qlp, qhp;
-#endif
-
- ASSERT (dn >= 1);
- ASSERT (nn >= dn);
- ASSERT ((dp[dn - 1] & GMP_NUMB_HIGHBIT) != 0);
-
- if (nn == dn)
- {
- cy = mpn_sub_n (np, np, dp, dn);
- mpn_cnd_add_n (cy, np, np, dp, dn);
-#if OPERATION_sec_pi1_div_qr
- return 1 - cy;
-#else
- return;
-#endif
- }
-
- /* Create a divisor copy shifted half a limb. */
- hp = tp; /* (dn + 1) limbs */
- hp[dn] = mpn_lshift (hp, dp, dn, GMP_NUMB_BITS / 2);
-
-#if OPERATION_sec_pi1_div_qr
- qlp = tp + (dn + 1); /* (nn - dn) limbs */
- qhp = tp + (nn + 1); /* (nn - dn) limbs */
-#endif
-
- np += nn - dn;
- nh = 0;
-
- for (i = nn - dn - 1; i >= 0; i--)
- {
- np--;
-
- nh = (nh << GMP_NUMB_BITS/2) + (np[dn] >> GMP_NUMB_BITS/2);
- umul_ppmm (q1h, dummy, nh, dinv);
- q1h += nh;
-#if OPERATION_sec_pi1_div_qr
- qhp[i] = q1h;
-#endif
- mpn_submul_1 (np, hp, dn + 1, q1h);
-
- nh = np[dn];
- umul_ppmm (q0h, dummy, nh, dinv);
- q0h += nh;
-#if OPERATION_sec_pi1_div_qr
- qlp[i] = q0h;
-#endif
- nh -= mpn_submul_1 (np, dp, dn, q0h);
- }
-
- /* 1st adjustment depends on extra high remainder limb. */
- cnd = nh != 0; /* FIXME: cmp-to-int */
-#if OPERATION_sec_pi1_div_qr
- qlp[0] += cnd;
-#endif
- nh -= mpn_cnd_sub_n (cnd, np, np, dp, dn);
-
- /* 2nd adjustment depends on remainder/divisor comparison as well as whether
- extra remainder limb was nullified by previous subtract. */
- cy = mpn_sub_n (np, np, dp, dn);
- cy = cy - nh;
-#if OPERATION_sec_pi1_div_qr
- qlp[0] += 1 - cy;
-#endif
- mpn_cnd_add_n (cy, np, np, dp, dn);
-
- /* 3rd adjustment depends on remainder/divisor comparison. */
- cy = mpn_sub_n (np, np, dp, dn);
-#if OPERATION_sec_pi1_div_qr
- qlp[0] += 1 - cy;
-#endif
- mpn_cnd_add_n (cy, np, np, dp, dn);
-
-#if OPERATION_sec_pi1_div_qr
- /* Combine quotient halves into final quotient. */
- qh = mpn_lshift (qhp, qhp, nn - dn, GMP_NUMB_BITS/2);
- qh += mpn_add_n (qp, qhp, qlp, nn - dn);
-
- return qh;
-#else
- return;
-#endif
-}
diff --git a/gmp/mpn/generic/sec_powm.c b/gmp/mpn/generic/sec_powm.c
deleted file mode 100644
index 67de44e10a..0000000000
--- a/gmp/mpn/generic/sec_powm.c
+++ /dev/null
@@ -1,438 +0,0 @@
-/* mpn_sec_powm -- Compute R = U^E mod M. Secure variant, side-channel silent
- under the assumption that the multiply instruction is side channel silent.
-
- Contributed to the GNU project by Torbjörn Granlund.
-
-Copyright 2007-2009, 2011, 2012, 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-/*
- BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
-
- 1. T <- (B^n * U) mod M Convert to REDC form
-
- 2. Compute table U^0, U^1, U^2... of E-dependent size
-
- 3. While there are more bits in E
- W <- power left-to-right base-k
-
-
- TODO:
-
- * Make getbits a macro, thereby allowing it to update the index operand.
- That will simplify the code using getbits. (Perhaps make getbits' sibling
- getbit then have similar form, for symmetry.)
-
- * Choose window size without looping. (Superoptimize or think(tm).)
-
- * REDC_1_TO_REDC_2_THRESHOLD might actually represent the cutoff between
- redc_1 and redc_n. On such systems, we will switch to redc_2 causing
- slowdown.
-*/
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#undef MPN_REDC_1_SEC
-#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \
- do { \
- mp_limb_t cy; \
- cy = mpn_redc_1 (rp, up, mp, n, invm); \
- mpn_cnd_sub_n (cy, rp, rp, mp, n); \
- } while (0)
-
-#undef MPN_REDC_2_SEC
-#define MPN_REDC_2_SEC(rp, up, mp, n, mip) \
- do { \
- mp_limb_t cy; \
- cy = mpn_redc_2 (rp, up, mp, n, mip); \
- mpn_cnd_sub_n (cy, rp, rp, mp, n); \
- } while (0)
-
-#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
-#define WANT_REDC_2 1
-#endif
-
-/* Define our own mpn squaring function. We do this since we cannot use a
- native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over
- SQR_TOOM2_THRESHOLD. This is so because of fixed size stack allocations
- made inside mpn_sqr_basecase. */
-
-#if HAVE_NATIVE_mpn_sqr_diagonal
-#define MPN_SQR_DIAGONAL(rp, up, n) \
- mpn_sqr_diagonal (rp, up, n)
-#else
-#define MPN_SQR_DIAGONAL(rp, up, n) \
- do { \
- mp_size_t _i; \
- for (_i = 0; _i < (n); _i++) \
- { \
- mp_limb_t ul, lpl; \
- ul = (up)[_i]; \
- umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS); \
- (rp)[2 * _i] = lpl >> GMP_NAIL_BITS; \
- } \
- } while (0)
-#endif
-
-
-#if ! HAVE_NATIVE_mpn_sqr_basecase
-/* The limit of the generic code is SQR_TOOM2_THRESHOLD. */
-#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD
-#endif
-
-#if HAVE_NATIVE_mpn_sqr_basecase
-#ifdef TUNE_SQR_TOOM2_MAX
-/* We slightly abuse TUNE_SQR_TOOM2_MAX here. If it is set for an assembly
- mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly
- file. An assembly mpn_sqr_basecase that does not define it, should allow
- any size. */
-#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD
-#endif
-#endif
-
-#ifdef WANT_FAT_BINARY
-/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from
- __gmpn_cpuvec. Perhaps any possible sqr_basecase.asm allow any size, and we
- limit the use unnecessarily. We cannot tell, so play it safe. FIXME. */
-#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD
-#endif
-
-#ifndef SQR_BASECASE_LIM
-/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any operand
- size. */
-#define mpn_local_sqr(rp,up,n,tp) mpn_sqr_basecase(rp,up,n)
-#else
-/* Define our own squaring function, which uses mpn_sqr_basecase for its
- allowed sizes, but its own code for larger sizes. */
-static void
-mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
-{
- mp_size_t i;
-
- ASSERT (n >= 1);
- ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));
-
- if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM))
- {
- mpn_sqr_basecase (rp, up, n);
- return;
- }
-
- {
- mp_limb_t ul, lpl;
- ul = up[0];
- umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
- rp[0] = lpl >> GMP_NAIL_BITS;
- }
- if (n > 1)
- {
- mp_limb_t cy;
-
- cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
- tp[n - 1] = cy;
- for (i = 2; i < n; i++)
- {
- mp_limb_t cy;
- cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
- tp[n + i - 2] = cy;
- }
- MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);
-
- {
- mp_limb_t cy;
-#if HAVE_NATIVE_mpn_addlsh1_n
- cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
-#else
- cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
- cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
-#endif
- rp[2 * n - 1] += cy;
- }
- }
-}
-#endif
-
-#define getbit(p,bi) \
- ((p[(bi - 1) / GMP_NUMB_BITS] >> (bi - 1) % GMP_NUMB_BITS) & 1)
-
-/* FIXME: Maybe some things would get simpler if all callers ensure
- that bi >= nbits. As far as I understand, with the current code bi
- < nbits can happen only for the final iteration. */
-static inline mp_limb_t
-getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
-{
- int nbits_in_r;
- mp_limb_t r;
- mp_size_t i;
-
- if (bi < nbits)
- {
- return p[0] & (((mp_limb_t) 1 << bi) - 1);
- }
- else
- {
- bi -= nbits; /* bit index of low bit to extract */
- i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */
- bi %= GMP_NUMB_BITS; /* bit index in low word */
- r = p[i] >> bi; /* extract (low) bits */
- nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */
- if (nbits_in_r < nbits) /* did we get enough bits? */
- r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */
- return r & (((mp_limb_t ) 1 << nbits) - 1);
- }
-}
-
-#ifndef POWM_SEC_TABLE
-#if GMP_NUMB_BITS < 50
-#define POWM_SEC_TABLE 2,33,96,780,2741
-#else
-#define POWM_SEC_TABLE 2,130,524,2578
-#endif
-#endif
-
-#if TUNE_PROGRAM_BUILD
-extern int win_size (mp_bitcnt_t);
-#else
-static inline int
-win_size (mp_bitcnt_t enb)
-{
- int k;
- /* Find k, such that x[k-1] < enb <= x[k].
-
- We require that x[k] >= k, then it follows that enb > x[k-1] >=
- k-1, which implies k <= enb.
- */
- static const mp_bitcnt_t x[] = {0,POWM_SEC_TABLE,~(mp_bitcnt_t)0};
- for (k = 1; enb > x[k]; k++)
- ;
- ASSERT (k <= enb);
- return k;
-}
-#endif
-
-/* Convert U to REDC form, U_r = B^n * U mod M.
- Uses scratch space at tp of size 2un + n + 1. */
-static void
-redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp)
-{
- MPN_ZERO (tp, n);
- MPN_COPY (tp + n, up, un);
-
- mpn_sec_div_r (tp, un + n, mp, n, tp + un + n);
- MPN_COPY (rp, tp, n);
-}
-
-/* {rp, n} <-- {bp, bn} ^ {ep, en} mod {mp, n},
- where en = ceil (enb / GMP_NUMB_BITS)
- Requires that {mp, n} is odd (and hence also mp[0] odd).
- Uses scratch space at tp as defined by mpn_sec_powm_itch. */
-void
-mpn_sec_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
- mp_srcptr ep, mp_bitcnt_t enb,
- mp_srcptr mp, mp_size_t n, mp_ptr tp)
-{
- mp_limb_t ip[2], *mip;
- int windowsize, this_windowsize;
- mp_limb_t expbits;
- mp_ptr pp, this_pp;
- long i;
- int cnd;
-
- ASSERT (enb > 0);
- ASSERT (n > 0);
- /* The code works for bn = 0, but the defined scratch space is 2 limbs
- greater than we supply, when converting 1 to redc form . */
- ASSERT (bn > 0);
- ASSERT ((mp[0] & 1) != 0);
-
- windowsize = win_size (enb);
-
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- {
- mip = ip;
- binvert_limb (mip[0], mp[0]);
- mip[0] = -mip[0];
- }
- else
- {
- mip = ip;
- mpn_binvert (mip, mp, 2, tp);
- mip[0] = -mip[0]; mip[1] = ~mip[1];
- }
-#else
- mip = ip;
- binvert_limb (mip[0], mp[0]);
- mip[0] = -mip[0];
-#endif
-
- pp = tp;
- tp += (n << windowsize); /* put tp after power table */
-
- /* Compute pp[0] table entry */
- /* scratch: | n | 1 | n+2 | */
- /* | pp[0] | 1 | redcify | */
- this_pp = pp;
- this_pp[n] = 1;
- redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1);
- this_pp += n;
-
- /* Compute pp[1] table entry. To avoid excessive scratch usage in the
- degenerate situation where B >> M, we let redcify use scratch space which
- will later be used by the pp table (element 2 and up). */
- /* scratch: | n | n | bn + n + 1 | */
- /* | pp[0] | pp[1] | redcify | */
- redcify (this_pp, bp, bn, mp, n, this_pp + n);
-
- /* Precompute powers of b and put them in the temporary area at pp. */
- /* scratch: | n | n | ... | | 2n | */
- /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | product | */
- for (i = (1 << windowsize) - 2; i > 0; i--)
- {
- mpn_mul_basecase (tp, this_pp, n, pp + n, n);
- this_pp += n;
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);
- else
- MPN_REDC_2_SEC (this_pp, tp, mp, n, mip);
-#else
- MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);
-#endif
- }
-
- expbits = getbits (ep, enb, windowsize);
- ASSERT_ALWAYS (enb >= windowsize);
- enb -= windowsize;
-
- mpn_sec_tabselect (rp, pp, n, 1 << windowsize, expbits);
-
- /* Main exponentiation loop. */
- /* scratch: | n | n | ... | | 3n-4n | */
- /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | loop scratch | */
-
-#define INNERLOOP \
- while (enb != 0) \
- { \
- expbits = getbits (ep, enb, windowsize); \
- this_windowsize = windowsize; \
- if (enb < windowsize) \
- { \
- this_windowsize -= windowsize - enb; \
- enb = 0; \
- } \
- else \
- enb -= windowsize; \
- \
- do \
- { \
- mpn_local_sqr (tp, rp, n, tp + 2 * n); \
- MPN_REDUCE (rp, tp, mp, n, mip); \
- this_windowsize--; \
- } \
- while (this_windowsize != 0); \
- \
- mpn_sec_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits); \
- mpn_mul_basecase (tp, rp, n, tp + 2*n, n); \
- \
- MPN_REDUCE (rp, tp, mp, n, mip); \
- }
-
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1_SEC (rp, tp, mp, n, mip[0])
- INNERLOOP;
- }
- else
- {
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2_SEC (rp, tp, mp, n, mip)
- INNERLOOP;
- }
-#else
-#undef MPN_MUL_N
-#undef MPN_SQR
-#undef MPN_REDUCE
-#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
-#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1_SEC (rp, tp, mp, n, mip[0])
- INNERLOOP;
-#endif
-
- MPN_COPY (tp, rp, n);
- MPN_ZERO (tp + n, n);
-
-#if WANT_REDC_2
- if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]);
- else
- MPN_REDC_2_SEC (rp, tp, mp, n, mip);
-#else
- MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]);
-#endif
- cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */
- mpn_cnd_sub_n (!cnd, rp, rp, mp, n);
-}
-
-mp_size_t
-mpn_sec_powm_itch (mp_size_t bn, mp_bitcnt_t enb, mp_size_t n)
-{
- int windowsize;
- mp_size_t redcify_itch, itch;
-
- /* The top scratch usage will either be when reducing B in the 2nd redcify
- call, or more typically n*2^windowsize + 3n or 4n, in the main loop. (It
- is 3n or 4n depending on if we use mpn_local_sqr or a native
- mpn_sqr_basecase. We assume 4n always for now.) */
-
- windowsize = win_size (enb);
-
- /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call,
- the (bn + n) term is due to redcify's own usage, and the rest is due to
- mpn_sec_div_r's usage when called from redcify. */
- redcify_itch = (2 * n) + (bn + n) + ((bn + n) + 2 * n + 2);
-
- /* The n * 2^windowsize term is due to the power table, the 4n term is due to
- scratch needs of squaring/multiplication in the exponentiation loop. */
- itch = (n << windowsize) + (4 * n);
-
- return MAX (itch, redcify_itch);
-}
diff --git a/gmp/mpn/generic/sec_sqr.c b/gmp/mpn/generic/sec_sqr.c
deleted file mode 100644
index 736924cc22..0000000000
--- a/gmp/mpn/generic/sec_sqr.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/* mpn_sec_sqr.
-
- Contributed to the GNU project by Torbjörn Granlund.
-
-Copyright 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-void
-mpn_sec_sqr (mp_ptr rp,
- mp_srcptr ap, mp_size_t an,
- mp_ptr tp)
-{
- mpn_sqr_basecase (rp, ap, an);
-}
-
-mp_size_t
-mpn_sec_sqr_itch (mp_size_t an)
-{
- return 0;
-}
diff --git a/gmp/mpn/generic/sec_tabselect.c b/gmp/mpn/generic/sec_tabselect.c
deleted file mode 100644
index a79c73a575..0000000000
--- a/gmp/mpn/generic/sec_tabselect.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/* mpn_sec_tabselect.
-
-Copyright 2007-2009, 2011, 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-/* Select entry `which' from table `tab', which has nents entries, each `n'
- limbs. Store the selected entry at rp. Reads entire table to avoid
- side-channel information leaks. O(n*nents). */
-void
-mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *tab,
- mp_size_t n, mp_size_t nents, mp_size_t which)
-{
- mp_size_t k, i;
- mp_limb_t mask;
- volatile mp_limb_t *tp;
-
- for (k = 0; k < nents; k++)
- {
- mask = -(mp_limb_t) (which == k);
- tp = tab + n * k;
- for (i = 0; i < n; i++)
- {
- rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
- }
- }
-}
diff --git a/gmp/mpn/generic/set_str.c b/gmp/mpn/generic/set_str.c
index 71034e34bf..975cfb0dad 100644
--- a/gmp/mpn/generic/set_str.c
+++ b/gmp/mpn/generic/set_str.c
@@ -9,34 +9,23 @@
FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE
GNU MP RELEASE.
-Copyright 1991-1994, 1996, 2000-2002, 2004, 2006-2008, 2012, 2013 Free
-Software Foundation, Inc.
+Copyright 1991, 1992, 1993, 1994, 1996, 2000, 2001, 2002, 2004, 2006, 2007,
+2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* TODO:
@@ -80,7 +69,7 @@ mpn_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base)
int next_bitpos;
mp_limb_t res_digit;
mp_size_t size;
- int bits_per_indigit = mp_bases[base].big_base;
+ int bits_per_indigit = __mp_bases[base].big_base;
size = 0;
res_digit = 0;
@@ -118,7 +107,7 @@ mpn_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base)
TMP_MARK;
- chars_per_limb = mp_bases[base].chars_per_limb;
+ chars_per_limb = __mp_bases[base].chars_per_limb;
un = str_len / chars_per_limb + 1;
@@ -142,15 +131,18 @@ mpn_set_str_compute_powtab (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, i
long i, pi;
mp_size_t n;
mp_ptr p, t;
- mp_limb_t big_base;
+ unsigned normalization_steps;
+ mp_limb_t big_base, big_base_inverted;
int chars_per_limb;
size_t digits_in_base;
mp_size_t shift;
powtab_mem_ptr = powtab_mem;
- chars_per_limb = mp_bases[base].chars_per_limb;
- big_base = mp_bases[base].big_base;
+ chars_per_limb = __mp_bases[base].chars_per_limb;
+ big_base = __mp_bases[base].big_base;
+ big_base_inverted = __mp_bases[base].big_base_inverted;
+ count_leading_zeros (normalization_steps, big_base);
p = powtab_mem_ptr;
powtab_mem_ptr += 1;
@@ -177,7 +169,7 @@ mpn_set_str_compute_powtab (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, i
ASSERT_ALWAYS (powtab_mem_ptr < powtab_mem + mpn_dc_set_str_powtab_alloc (un));
- mpn_sqr (t, p, n);
+ mpn_sqr_n (t, p, n);
n = 2 * n - 1; n += t[n] != 0;
digits_in_base *= 2;
#if 1
@@ -247,9 +239,7 @@ mpn_dc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len,
if (hn == 0)
{
- /* Zero +1 limb here, to avoid reading an allocated but uninitialised
- limb in mpn_incr_u below. */
- MPN_ZERO (rp, powtab->n + sn + 1);
+ MPN_ZERO (rp, powtab->n + sn);
}
else
{
@@ -288,11 +278,11 @@ mpn_bc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base)
mp_limb_t res_digit;
ASSERT (base >= 2);
- ASSERT (base < numberof (mp_bases));
+ ASSERT (base < numberof (__mp_bases));
ASSERT (str_len >= 1);
- big_base = mp_bases[base].big_base;
- chars_per_limb = mp_bases[base].chars_per_limb;
+ big_base = __mp_bases[base].big_base;
+ chars_per_limb = __mp_bases[base].chars_per_limb;
size = 0;
for (i = chars_per_limb; i < str_len; i += chars_per_limb)
diff --git a/gmp/mpn/generic/sizeinbase.c b/gmp/mpn/generic/sizeinbase.c
index 16633569ec..edd10b544e 100644
--- a/gmp/mpn/generic/sizeinbase.c
+++ b/gmp/mpn/generic/sizeinbase.c
@@ -4,34 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 1991, 1993-1995, 2001, 2002, 2011, 2012 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 1995, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -44,7 +32,27 @@ see https://www.gnu.org/licenses/. */
size_t
mpn_sizeinbase (mp_srcptr xp, mp_size_t xsize, int base)
{
- size_t result;
- MPN_SIZEINBASE (result, xp, xsize, base);
- return result;
+ int lb_base, cnt;
+ mp_size_t totbits;
+
+ ASSERT (xsize >= 0);
+ ASSERT (base >= 2);
+ ASSERT (base < numberof (__mp_bases));
+
+ /* Special case for X == 0. */
+ if (xsize == 0)
+ return 1;
+
+ /* Calculate the total number of significant bits of X. */
+ count_leading_zeros (cnt, xp[xsize-1]);
+ totbits = xsize * BITS_PER_MP_LIMB - cnt;
+
+ if (POW2_P (base))
+ {
+ /* Special case for powers of 2, giving exact result. */
+ lb_base = __mp_bases[base].big_base;
+ return (totbits + lb_base - 1) / lb_base;
+ }
+ else
+ return (size_t) (totbits * __mp_bases[base].chars_per_bit_exactly) + 1;
}
diff --git a/gmp/mpn/generic/sqr.c b/gmp/mpn/generic/sqr.c
deleted file mode 100644
index 3743761f78..0000000000
--- a/gmp/mpn/generic/sqr.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/* mpn_sqr -- square natural numbers.
-
-Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-void
-mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
-{
- ASSERT (n >= 1);
- ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
-
- if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
- { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
- mpn_mul_basecase (p, a, n, a, n);
- }
- else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))
- {
- mpn_sqr_basecase (p, a, n);
- }
- else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
- {
- /* Allocate workspace of fixed size on stack: fast! */
- mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)];
- ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
- mpn_toom2_sqr (p, a, n, ws);
- }
- else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
- {
- mp_ptr ws;
- TMP_SDECL;
- TMP_SMARK;
- ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n));
- mpn_toom3_sqr (p, a, n, ws);
- TMP_SFREE;
- }
- else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))
- {
- mp_ptr ws;
- TMP_SDECL;
- TMP_SMARK;
- ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
- mpn_toom4_sqr (p, a, n, ws);
- TMP_SFREE;
- }
- else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
- {
- mp_ptr ws;
- TMP_SDECL;
- TMP_SMARK;
- ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n));
- mpn_toom6_sqr (p, a, n, ws);
- TMP_SFREE;
- }
- else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
- {
- mp_ptr ws;
- TMP_DECL;
- TMP_MARK;
- ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n));
- mpn_toom8_sqr (p, a, n, ws);
- TMP_FREE;
- }
- else
- {
- /* The current FFT code allocates its own space. That should probably
- change. */
- mpn_fft_mul (p, a, n, a, n);
- }
-}
diff --git a/gmp/mpn/generic/sqr_basecase.c b/gmp/mpn/generic/sqr_basecase.c
index fc6a043a94..56d22216f6 100644
--- a/gmp/mpn/generic/sqr_basecase.c
+++ b/gmp/mpn/generic/sqr_basecase.c
@@ -5,34 +5,23 @@
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011 Free Software
-Foundation, Inc.
+Copyright 1991, 1992, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2003, 2004,
+2005, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -56,30 +45,6 @@ see https://www.gnu.org/licenses/. */
} while (0)
#endif
-#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
-#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \
- mpn_sqr_diag_addlsh1 (rp, tp, up, n)
-#else
-#if HAVE_NATIVE_mpn_addlsh1_n
-#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \
- do { \
- mp_limb_t cy; \
- MPN_SQR_DIAGONAL (rp, up, n); \
- cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2); \
- rp[2 * n - 1] += cy; \
- } while (0)
-#else
-#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \
- do { \
- mp_limb_t cy; \
- MPN_SQR_DIAGONAL (rp, up, n); \
- cy = mpn_lshift (tp, tp, 2 * n - 2, 1); \
- cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2); \
- rp[2 * n - 1] += cy; \
- } while (0)
-#endif
-#endif
-
#undef READY_WITH_mpn_sqr_basecase
@@ -89,12 +54,12 @@ void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
mp_size_t i;
- mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
+ mp_limb_t tarr[2 * SQR_KARATSUBA_THRESHOLD];
mp_ptr tp = tarr;
mp_limb_t cy;
/* must fit 2*n limbs in tarr */
- ASSERT (n <= SQR_TOOM2_THRESHOLD);
+ ASSERT (n <= SQR_KARATSUBA_THRESHOLD);
if ((n & 1) != 0)
{
@@ -119,13 +84,9 @@ mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
if (n == 2)
{
-#if HAVE_NATIVE_mpn_mul_2
- rp[3] = mpn_mul_2 (rp, up, 2, up);
-#else
rp[0] = 0;
rp[1] = 0;
rp[3] = mpn_addmul_2 (rp, up, 2, up);
-#endif
return;
}
@@ -140,7 +101,15 @@ mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
tp[2 * n - 3] = cy;
}
- MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
+ MPN_SQR_DIAGONAL (rp, up, n);
+
+#if HAVE_NATIVE_mpn_addlsh1_n
+ cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
+#else
+ cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+ cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
+#endif
+ rp[2 * n - 1] += cy;
}
#define READY_WITH_mpn_sqr_basecase
#endif
@@ -167,12 +136,12 @@ void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
mp_size_t i;
- mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
+ mp_limb_t tarr[2 * SQR_KARATSUBA_THRESHOLD];
mp_ptr tp = tarr;
mp_limb_t cy;
/* must fit 2*n limbs in tarr */
- ASSERT (n <= SQR_TOOM2_THRESHOLD);
+ ASSERT (n <= SQR_KARATSUBA_THRESHOLD);
if ((n & 1) != 0)
{
@@ -225,13 +194,9 @@ mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
if (n == 2)
{
-#if HAVE_NATIVE_mpn_mul_2
- rp[3] = mpn_mul_2 (rp, up, 2, up);
-#else
rp[0] = 0;
rp[1] = 0;
rp[3] = mpn_addmul_2 (rp, up, 2, up);
-#endif
return;
}
@@ -303,12 +268,12 @@ mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
}
if (n > 1)
{
- mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
+ mp_limb_t tarr[2 * SQR_KARATSUBA_THRESHOLD];
mp_ptr tp = tarr;
mp_limb_t cy;
/* must fit 2*n limbs in tarr */
- ASSERT (n <= SQR_TOOM2_THRESHOLD);
+ ASSERT (n <= SQR_KARATSUBA_THRESHOLD);
cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
tp[n - 1] = cy;
@@ -318,8 +283,18 @@ mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
tp[n + i - 2] = cy;
}
+ MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);
- MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
+ {
+ mp_limb_t cy;
+#if HAVE_NATIVE_mpn_addlsh1_n
+ cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
+#else
+ cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+ cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
+#endif
+ rp[2 * n - 1] += cy;
+ }
}
}
#endif
diff --git a/gmp/mpn/generic/sqrmod_bnm1.c b/gmp/mpn/generic/sqrmod_bnm1.c
deleted file mode 100644
index fd0868b90b..0000000000
--- a/gmp/mpn/generic/sqrmod_bnm1.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/* sqrmod_bnm1.c -- squaring mod B^n-1.
-
- Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
- Marco Bodrato.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-/* Input is {ap,rn}; output is {rp,rn}, computation is
- mod B^rn - 1, and values are semi-normalised; zero is represented
- as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp.
- tp==rp is allowed. */
-static void
-mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
-{
- mp_limb_t cy;
-
- ASSERT (0 < rn);
-
- mpn_sqr (tp, ap, rn);
- cy = mpn_add_n (rp, tp, tp + rn, rn);
- /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
- * be no overflow when adding in the carry. */
- MPN_INCR_U (rp, rn, cy);
-}
-
-
-/* Input is {ap,rn+1}; output is {rp,rn+1}, in
- semi-normalised representation, computation is mod B^rn + 1. Needs
- a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
- Output is normalised. */
-static void
-mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
-{
- mp_limb_t cy;
-
- ASSERT (0 < rn);
-
- mpn_sqr (tp, ap, rn + 1);
- ASSERT (tp[2*rn+1] == 0);
- ASSERT (tp[2*rn] < GMP_NUMB_MAX);
- cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
- rp[rn] = 0;
- MPN_INCR_U (rp, rn+1, cy );
-}
-
-
-/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
- *
- * The result is expected to be ZERO if and only if the operand
- * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
- * B^rn-1.
- * It should not be a problem if sqrmod_bnm1 is used to
- * compute the full square with an <= 2*rn, because this condition
- * implies (B^an-1)^2 < (B^rn-1) .
- *
- * Requires rn/4 < an <= rn
- * Scratch need: rn/2 + (need for recursive call OR rn + 3). This gives
- *
- * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4
- */
-void
-mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp)
-{
- ASSERT (0 < an);
- ASSERT (an <= rn);
-
- if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD))
- {
- if (UNLIKELY (an < rn))
- {
- if (UNLIKELY (2*an <= rn))
- {
- mpn_sqr (rp, ap, an);
- }
- else
- {
- mp_limb_t cy;
- mpn_sqr (tp, ap, an);
- cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn);
- MPN_INCR_U (rp, rn, cy);
- }
- }
- else
- mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp);
- }
- else
- {
- mp_size_t n;
- mp_limb_t cy;
- mp_limb_t hi;
-
- n = rn >> 1;
-
- ASSERT (2*an > n);
-
- /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
- and crt together as
-
- x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
- */
-
-#define a0 ap
-#define a1 (ap + n)
-
-#define xp tp /* 2n + 2 */
- /* am1 maybe in {xp, n} */
-#define sp1 (tp + 2*n + 2)
- /* ap1 maybe in {sp1, n + 1} */
-
- {
- mp_srcptr am1;
- mp_size_t anm;
- mp_ptr so;
-
- if (LIKELY (an > n))
- {
- so = xp + n;
- am1 = xp;
- cy = mpn_add (xp, a0, n, a1, an - n);
- MPN_INCR_U (xp, n, cy);
- anm = n;
- }
- else
- {
- so = xp;
- am1 = a0;
- anm = an;
- }
-
- mpn_sqrmod_bnm1 (rp, n, am1, anm, so);
- }
-
- {
- int k;
- mp_srcptr ap1;
- mp_size_t anp;
-
- if (LIKELY (an > n)) {
- ap1 = sp1;
- cy = mpn_sub (sp1, a0, n, a1, an - n);
- sp1[n] = 0;
- MPN_INCR_U (sp1, n + 1, cy);
- anp = n + ap1[n];
- } else {
- ap1 = a0;
- anp = an;
- }
-
- if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
- k=0;
- else
- {
- int mask;
- k = mpn_fft_best_k (n, 1);
- mask = (1<<k) -1;
- while (n & mask) {k--; mask >>=1;};
- }
- if (k >= FFT_FIRST_K)
- xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k);
- else if (UNLIKELY (ap1 == a0))
- {
- ASSERT (anp <= n);
- ASSERT (2*anp > n);
- mpn_sqr (xp, a0, an);
- anp = 2*an - n;
- cy = mpn_sub (xp, xp, n, xp + n, anp);
- xp[n] = 0;
- MPN_INCR_U (xp, n+1, cy);
- }
- else
- mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp);
- }
-
- /* Here the CRT recomposition begins.
-
- xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
- Division by 2 is a bitwise rotation.
-
- Assumes xp normalised mod (B^n+1).
-
- The residue class [0] is represented by [B^n-1]; except when
- both input are ZERO.
- */
-
-#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
-#if HAVE_NATIVE_mpn_rsh1add_nc
- cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
- hi = cy << (GMP_NUMB_BITS - 1);
- cy = 0;
- /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
- overflows, i.e. a further increment will not overflow again. */
-#else /* ! _nc */
- cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
- hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
- cy >>= 1;
- /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
- the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
-#endif
-#if GMP_NAIL_BITS == 0
- add_ssaaaa(cy, rp[n-1], cy, rp[n-1], CNST_LIMB(0), hi);
-#else
- cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
- rp[n-1] ^= hi;
-#endif
-#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
-#if HAVE_NATIVE_mpn_add_nc
- cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
-#else /* ! _nc */
- cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
-#endif
- cy += (rp[0]&1);
- mpn_rshift(rp, rp, n, 1);
- ASSERT (cy <= 2);
- hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
- cy >>= 1;
- /* We can have cy != 0 only if hi = 0... */
- ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
- rp[n-1] |= hi;
- /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
-#endif
- ASSERT (cy <= 1);
- /* Next increment can not overflow, read the previous comments about cy. */
- ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
- MPN_INCR_U(rp, n, cy);
-
- /* Compute the highest half:
- ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
- */
- if (UNLIKELY (2*an < rn))
- {
- /* Note that in this case, the only way the result can equal
- zero mod B^{rn} - 1 is if the input is zero, and
- then the output of both the recursive calls and this CRT
- reconstruction is zero, not B^{rn} - 1. */
- cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);
-
- /* FIXME: This subtraction of the high parts is not really
- necessary, we do it to get the carry out, and for sanity
- checking. */
- cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n,
- xp + 2*an - n, rn - 2*an, cy);
- ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an));
- cy = mpn_sub_1 (rp, rp, 2*an, cy);
- ASSERT (cy == (xp + 2*an - n)[0]);
- }
- else
- {
- cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
- /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
- DECR will affect _at most_ the lowest n limbs. */
- MPN_DECR_U (rp, 2*n, cy);
- }
-#undef a0
-#undef a1
-#undef xp
-#undef sp1
- }
-}
-
-mp_size_t
-mpn_sqrmod_bnm1_next_size (mp_size_t n)
-{
- mp_size_t nh;
-
- if (BELOW_THRESHOLD (n, SQRMOD_BNM1_THRESHOLD))
- return n;
- if (BELOW_THRESHOLD (n, 4 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
- return (n + (2-1)) & (-2);
- if (BELOW_THRESHOLD (n, 8 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
- return (n + (4-1)) & (-4);
-
- nh = (n + 1) >> 1;
-
- if (BELOW_THRESHOLD (nh, SQR_FFT_MODF_THRESHOLD))
- return (n + (8-1)) & (-8);
-
- return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 1));
-}
diff --git a/gmp/mpn/generic/sqrtrem.c b/gmp/mpn/generic/sqrtrem.c
index 7d0f120001..ac878c5083 100644
--- a/gmp/mpn/generic/sqrtrem.c
+++ b/gmp/mpn/generic/sqrtrem.c
@@ -8,34 +8,23 @@
INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
DISAPPEAR IN A FUTURE GMP RELEASE.
-Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012 Free Software Foundation,
+Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* See "Karatsuba Square Root", reference in gmp.texi. */
@@ -48,64 +37,64 @@ see https://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
-static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
+static const unsigned short invsqrttab[384] =
{
- 0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
- 0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
- 0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
- 0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
- 0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
- 0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
- 0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
- 0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
- 0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
- 0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
- 0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
- 0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
- 0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
- 0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
- 0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
- 0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
- 0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
- 0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
- 0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
- 0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
- 0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
- 0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
- 0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
- 0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
- 0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
- 0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
- 0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
- 0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
- 0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
- 0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
- 0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
- 0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
- 0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
- 0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
- 0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
- 0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
- 0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
- 0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
- 0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
- 0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
- 0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
- 0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
- 0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
- 0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
- 0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
- 0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
- 0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
- 0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00 /* sqrt(1/1f8)..sqrt(1/1ff) */
+ 0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
+ 0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
+ 0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
+ 0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
+ 0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
+ 0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
+ 0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
+ 0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
+ 0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
+ 0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
+ 0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
+ 0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
+ 0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
+ 0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
+ 0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
+ 0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
+ 0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
+ 0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
+ 0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
+ 0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
+ 0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
+ 0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
+ 0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
+ 0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
+ 0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
+ 0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
+ 0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
+ 0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
+ 0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
+ 0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
+ 0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
+ 0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
+ 0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
+ 0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
+ 0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
+ 0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
+ 0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
+ 0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
+ 0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
+ 0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
+ 0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
+ 0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
+ 0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
+ 0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
+ 0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
+ 0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
+ 0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
+ 0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100 /* sqrt(1/1f8)..sqrt(1/1ff) */
};
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */
#if GMP_NUMB_BITS > 32
-#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
+#define MAGIC 0x10000000000 /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
-#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
+#define MAGIC 0x100000 /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
static mp_limb_t
@@ -126,16 +115,16 @@ mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
iteration convert from 1/sqrt(a) to sqrt(a). */
abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
- x0 = 0x100 | invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */
+ x0 = invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */
/* x0 is now an 8 bits approximation of 1/sqrt(a0) */
#if GMP_NUMB_BITS > 32
a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
- t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
+ t = (mp_limb_signed_t) (0x2000000000000l - 0x30000 - a1 * x0 * x0) >> 16;
x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
- /* x0 is now a 16 bits approximation of 1/sqrt(a0) */
+ /* x0 is now an 16 bits approximation of 1/sqrt(a0) */
t2 = x0 * (a0 >> (32-8));
t = t2 >> 25;
@@ -250,18 +239,14 @@ mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
q >>= 1;
if (c != 0)
c = mpn_add_n (np + l, np + l, sp + l, h);
- mpn_sqr (np + n, sp, l);
+ mpn_sqr_n (np + n, sp, l);
b = q + mpn_sub_n (np, np, np + n, 2 * l);
c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
q = mpn_add_1 (sp + l, sp + l, h, q);
if (c < 0)
{
-#if HAVE_NATIVE_mpn_addlsh1_n
- c += mpn_addlsh1_n (np, np, sp, n) + 2 * q;
-#else
c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
-#endif
c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
}
diff --git a/gmp/mpn/generic/sub.c b/gmp/mpn/generic/sub.c
index 3fbcbbe98b..ada3e91b83 100644
--- a/gmp/mpn/generic/sub.c
+++ b/gmp/mpn/generic/sub.c
@@ -5,28 +5,17 @@ Copyright 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define __GMP_FORCE_mpn_sub 1
diff --git a/gmp/mpn/generic/sub_1.c b/gmp/mpn/generic/sub_1.c
index db2e6f948f..4ed2eabccb 100644
--- a/gmp/mpn/generic/sub_1.c
+++ b/gmp/mpn/generic/sub_1.c
@@ -5,28 +5,17 @@ Copyright 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define __GMP_FORCE_mpn_sub_1 1
diff --git a/gmp/mpn/generic/sub_err1_n.c b/gmp/mpn/generic/sub_err1_n.c
deleted file mode 100644
index 340313a323..0000000000
--- a/gmp/mpn/generic/sub_err1_n.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/* mpn_sub_err1_n -- sub_n with one error term
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- Computes:
-
- (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy,
- return value is borrow out.
-
- (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy).
- Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep.
-
- Requires n >= 1.
-
- None of the outputs may overlap each other or any of the inputs, except
- that {rp,n} may be equal to {up,n} or {vp,n}.
-*/
-mp_limb_t
-mpn_sub_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
- mp_ptr ep, mp_srcptr yp,
- mp_size_t n, mp_limb_t cy)
-{
- mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, up, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n));
-
- yp += n - 1;
- el = eh = 0;
-
- do
- {
- yl = *yp--;
- ul = *up++;
- vl = *vp++;
-
- /* ordinary sub_n */
- SUBC_LIMB (cy1, sl, ul, vl);
- SUBC_LIMB (cy2, rl, sl, cy);
- cy = cy1 | cy2;
- *rp++ = rl;
-
- /* update (eh:el) */
- zl = (-cy) & yl;
- el += zl;
- eh += el < zl;
- }
- while (--n);
-
-#if GMP_NAIL_BITS != 0
- eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS);
- el &= GMP_NUMB_MASK;
-#endif
-
- ep[0] = el;
- ep[1] = eh;
-
- return cy;
-}
diff --git a/gmp/mpn/generic/sub_err2_n.c b/gmp/mpn/generic/sub_err2_n.c
deleted file mode 100644
index 63ea2451b4..0000000000
--- a/gmp/mpn/generic/sub_err2_n.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/* mpn_sub_err2_n -- sub_n with two error terms
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- Computes:
-
- (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy,
- return value is borrow out.
-
- (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy).
- Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
- c[1]*yp2[n-1] + ... + c[n]*yp2[0],
- stores two-limb results at {ep,2} and {ep+2,2} respectively.
-
- Requires n >= 1.
-
- None of the outputs may overlap each other or any of the inputs, except
- that {rp,n} may be equal to {up,n} or {vp,n}.
-*/
-mp_limb_t
-mpn_sub_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
- mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2,
- mp_size_t n, mp_limb_t cy)
-{
- mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, up, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, yp1, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n));
- ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n));
-
- yp1 += n - 1;
- yp2 += n - 1;
- el1 = eh1 = 0;
- el2 = eh2 = 0;
-
- do
- {
- yl1 = *yp1--;
- yl2 = *yp2--;
- ul = *up++;
- vl = *vp++;
-
- /* ordinary sub_n */
- SUBC_LIMB (cy1, sl, ul, vl);
- SUBC_LIMB (cy2, rl, sl, cy);
- cy = cy1 | cy2;
- *rp++ = rl;
-
- /* update (eh1:el1) */
- zl1 = (-cy) & yl1;
- el1 += zl1;
- eh1 += el1 < zl1;
-
- /* update (eh2:el2) */
- zl2 = (-cy) & yl2;
- el2 += zl2;
- eh2 += el2 < zl2;
- }
- while (--n);
-
-#if GMP_NAIL_BITS != 0
- eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
- el1 &= GMP_NUMB_MASK;
- eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
- el2 &= GMP_NUMB_MASK;
-#endif
-
- ep[0] = el1;
- ep[1] = eh1;
- ep[2] = el2;
- ep[3] = eh2;
-
- return cy;
-}
diff --git a/gmp/mpn/generic/sub_err3_n.c b/gmp/mpn/generic/sub_err3_n.c
deleted file mode 100644
index a80e05d0d9..0000000000
--- a/gmp/mpn/generic/sub_err3_n.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/* mpn_sub_err3_n -- sub_n with three error terms
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/*
- Computes:
-
- (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy,
- return value is borrow out.
-
- (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy).
- Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
- c[1]*yp2[n-1] + ... + c[n]*yp2[0],
- c[1]*yp3[n-1] + ... + c[n]*yp3[0],
- stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively.
-
- Requires n >= 1.
-
- None of the outputs may overlap each other or any of the inputs, except
- that {rp,n} may be equal to {up,n} or {vp,n}.
-*/
-mp_limb_t
-mpn_sub_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
- mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3,
- mp_size_t n, mp_limb_t cy)
-{
- mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2;
-
- ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
- ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
- ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, up, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, yp2, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n));
- ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n));
-
- yp1 += n - 1;
- yp2 += n - 1;
- yp3 += n - 1;
- el1 = eh1 = 0;
- el2 = eh2 = 0;
- el3 = eh3 = 0;
-
- do
- {
- yl1 = *yp1--;
- yl2 = *yp2--;
- yl3 = *yp3--;
- ul = *up++;
- vl = *vp++;
-
- /* ordinary sub_n */
- SUBC_LIMB (cy1, sl, ul, vl);
- SUBC_LIMB (cy2, rl, sl, cy);
- cy = cy1 | cy2;
- *rp++ = rl;
-
- /* update (eh1:el1) */
- zl1 = (-cy) & yl1;
- el1 += zl1;
- eh1 += el1 < zl1;
-
- /* update (eh2:el2) */
- zl2 = (-cy) & yl2;
- el2 += zl2;
- eh2 += el2 < zl2;
-
- /* update (eh3:el3) */
- zl3 = (-cy) & yl3;
- el3 += zl3;
- eh3 += el3 < zl3;
- }
- while (--n);
-
-#if GMP_NAIL_BITS != 0
- eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
- el1 &= GMP_NUMB_MASK;
- eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
- el2 &= GMP_NUMB_MASK;
- eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS);
- el3 &= GMP_NUMB_MASK;
-#endif
-
- ep[0] = el1;
- ep[1] = eh1;
- ep[2] = el2;
- ep[3] = eh2;
- ep[4] = el3;
- ep[5] = eh3;
-
- return cy;
-}
diff --git a/gmp/mpn/generic/sub_n.c b/gmp/mpn/generic/sub_n.c
index 29de2d2d89..d33668fa86 100644
--- a/gmp/mpn/generic/sub_n.c
+++ b/gmp/mpn/generic/sub_n.c
@@ -1,32 +1,21 @@
/* mpn_sub_n -- Subtract equal length limb vectors.
-Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc.
+Copyright 1992, 1993, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -40,8 +29,8 @@ mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
mp_limb_t ul, vl, sl, rl, cy, cy1, cy2;
ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
- ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
cy = 0;
do
@@ -70,8 +59,8 @@ mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
mp_limb_t ul, vl, rl, cy;
ASSERT (n >= 1);
- ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
- ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
cy = 0;
do
diff --git a/gmp/mpn/generic/subcnd_n.c b/gmp/mpn/generic/subcnd_n.c
new file mode 100644
index 0000000000..0dcc45641d
--- /dev/null
+++ b/gmp/mpn/generic/subcnd_n.c
@@ -0,0 +1,85 @@
+/* mpn_subcnd_n -- Compute R = U - V if CND != 0 or R = U if CND == 0.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 1992, 1993, 1994, 1996, 2000, 2002, 2008, 2009 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+#if GMP_NAIL_BITS == 0
+
+mp_limb_t
+mpn_subcnd_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_t cnd)
+{
+ mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask;
+
+ ASSERT (n >= 1);
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+
+ mask = -(mp_limb_t) (cnd != 0);
+ cy = 0;
+ do
+ {
+ ul = *up++;
+ vl = *vp++ & mask;
+ sl = ul - vl;
+ cy1 = sl > ul;
+ rl = sl - cy;
+ cy2 = rl > sl;
+ cy = cy1 | cy2;
+ *rp++ = rl;
+ }
+ while (--n != 0);
+
+ return cy;
+}
+
+#endif
+
+#if GMP_NAIL_BITS >= 1
+
+mp_limb_t
+mpn_subcnd_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_t cnd)
+{
+ mp_limb_t ul, vl, rl, cy, mask;
+
+ ASSERT (n >= 1);
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+ ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+
+ mask = -(mp_limb_t) (cnd != 0);
+ cy = 0;
+ do
+ {
+ ul = *up++;
+ vl = *vp++ & mask;
+ rl = ul - vl - cy;
+ cy = rl >> (GMP_LIMB_BITS - 1);
+ *rp++ = rl & GMP_NUMB_MASK;
+ }
+ while (--n != 0);
+
+ return cy;
+}
+
+#endif
diff --git a/gmp/mpn/generic/submul_1.c b/gmp/mpn/generic/submul_1.c
index fbc3501389..3e8e74302d 100644
--- a/gmp/mpn/generic/submul_1.c
+++ b/gmp/mpn/generic/submul_1.c
@@ -3,33 +3,23 @@
vector pointed to by RP. Return the most significant limb of the
product, adjusted for carry-out from the subtraction.
-Copyright 1992-1994, 1996, 2000, 2002, 2004 Free Software Foundation, Inc.
+Copyright 1992, 1993, 1994, 1996, 2000, 2002, 2004 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/generic/tdiv_qr.c b/gmp/mpn/generic/tdiv_qr.c
index be213b0467..8ac4d38813 100644
--- a/gmp/mpn/generic/tdiv_qr.c
+++ b/gmp/mpn/generic/tdiv_qr.c
@@ -1,43 +1,33 @@
/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If
qxn is non-zero, generate that many fraction limbs and append them after the
- other quotient limbs, and update the remainder accordingly. The input
+ other quotient limbs, and update the remainder accordingly. The input
operands are unaffected.
Preconditions:
1. The most significant limb of the divisor must be non-zero.
- 2. nn >= dn, even if qxn is non-zero. (??? relax this ???)
+ 2. No argument overlap is permitted. (??? relax this ???)
+ 3. nn >= dn, even if qxn is non-zero. (??? relax this ???)
The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
complexity of multiplication.
-Copyright 1997, 2000-2002, 2005, 2009 Free Software Foundation, Inc.
+Copyright 1997, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -48,8 +38,13 @@ void
mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
{
+ /* FIXME:
+ 1. qxn
+ 2. pass allocated storage in additional parameter?
+ */
ASSERT_ALWAYS (qxn == 0);
+ ASSERT (qxn >= 0);
ASSERT (nn >= 0);
ASSERT (dn >= 0);
ASSERT (dn == 0 || dp[dn - 1] != 0);
@@ -63,7 +58,7 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
case 1:
{
- rp[0] = mpn_divrem_1 (qp, (mp_size_t) 0, np, nn, dp[0]);
+ rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]);
return;
}
@@ -82,7 +77,7 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
d2p = dtmp;
d2p[1] = (dp[1] << cnt) | (dp[0] >> (GMP_NUMB_BITS - cnt));
d2p[0] = (dp[0] << cnt) & GMP_NUMB_MASK;
- n2p = TMP_ALLOC_LIMBS (nn + 1);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
cy = mpn_lshift (n2p, np, nn, cnt);
n2p[nn] = cy;
qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
@@ -95,7 +90,7 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
else
{
d2p = (mp_ptr) dp;
- n2p = TMP_ALLOC_LIMBS (nn);
+ n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB);
MPN_COPY (n2p, np, nn);
qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
qp[nn - 2] = qhl; /* always store nn-2+1 quotient limbs */
@@ -109,13 +104,12 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
default:
{
int adjust;
- gmp_pi1_t dinv;
TMP_DECL;
TMP_MARK;
adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */
if (nn + adjust >= 2 * dn)
{
- mp_ptr n2p, d2p;
+ mp_ptr n2p, d2p, q2p;
mp_limb_t cy;
int cnt;
@@ -124,9 +118,9 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
{
count_leading_zeros (cnt, dp[dn - 1]);
cnt -= GMP_NAIL_BITS;
- d2p = TMP_ALLOC_LIMBS (dn);
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
mpn_lshift (d2p, dp, dn, cnt);
- n2p = TMP_ALLOC_LIMBS (nn + 1);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
cy = mpn_lshift (n2p, np, nn, cnt);
n2p[nn] = cy;
nn += adjust;
@@ -135,28 +129,51 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
{
cnt = 0;
d2p = (mp_ptr) dp;
- n2p = TMP_ALLOC_LIMBS (nn + 1);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
MPN_COPY (n2p, np, nn);
n2p[nn] = 0;
nn += adjust;
}
- invert_pi1 (dinv, d2p[dn - 1], d2p[dn - 2]);
- if (BELOW_THRESHOLD (dn, DC_DIV_QR_THRESHOLD))
- mpn_sbpi1_div_qr (qp, n2p, nn, d2p, dn, dinv.inv32);
- else if (BELOW_THRESHOLD (dn, MUPI_DIV_QR_THRESHOLD) || /* fast condition */
- BELOW_THRESHOLD (nn, 2 * MU_DIV_QR_THRESHOLD) || /* fast condition */
- (double) (2 * (MU_DIV_QR_THRESHOLD - MUPI_DIV_QR_THRESHOLD)) * dn /* slow... */
- + (double) MUPI_DIV_QR_THRESHOLD * nn > (double) dn * nn) /* ...condition */
- mpn_dcpi1_div_qr (qp, n2p, nn, d2p, dn, &dinv);
+ if (dn < DIV_DC_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
else
{
- mp_size_t itch = mpn_mu_div_qr_itch (nn, dn, 0);
- mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
- mpn_mu_div_qr (qp, rp, n2p, nn, d2p, dn, scratch);
- n2p = rp;
+ /* Divide 2*dn / dn limbs as long as the limbs in np last. */
+ q2p = qp + nn - dn;
+ n2p += nn - dn;
+ do
+ {
+ q2p -= dn; n2p -= dn;
+ mpn_dc_divrem_n (q2p, n2p, d2p, dn);
+ nn -= dn;
+ }
+ while (nn >= 2 * dn);
+
+ if (nn != dn)
+ {
+ mp_limb_t ql;
+ n2p -= nn - dn;
+
+ /* We have now dn < nn < 2dn. Make a recursive call,
+ since falling out to the code below isn't pretty.
+ Unfortunately, mpn_tdiv_qr returns nn-dn+1 quotient
+ limbs, which would overwrite one already generated
+ quotient limb. Preserve it with an ugly hack. */
+ /* FIXME: This suggests that we should have an
+ mpn_tdiv_qr_internal that instead returns the most
+ significant quotient limb and move the meat of this
+ function there. */
+ /* FIXME: Perhaps call mpn_sb_divrem_mn here for certain
+ operand ranges, to decrease overhead for small
+ operands? */
+ ql = qp[nn - dn]; /* preserve quotient limb... */
+ mpn_tdiv_qr (qp, n2p, 0L, n2p, nn, d2p, dn);
+ qp[nn - dn] = ql; /* ...restore it again */
+ }
}
+
if (cnt != 0)
mpn_rshift (rp, n2p, dn, cnt);
else
@@ -229,11 +246,11 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
count_leading_zeros (cnt, dp[dn - 1]);
cnt -= GMP_NAIL_BITS;
- d2p = TMP_ALLOC_LIMBS (qn);
+ d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB);
mpn_lshift (d2p, dp + in, qn, cnt);
d2p[0] |= dp[in - 1] >> (GMP_NUMB_BITS - cnt);
- n2p = TMP_ALLOC_LIMBS (2 * qn + 1);
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
if (adjust)
{
@@ -250,7 +267,7 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
cnt = 0;
d2p = (mp_ptr) dp + in;
- n2p = TMP_ALLOC_LIMBS (2 * qn + 1);
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
if (adjust)
{
@@ -263,30 +280,25 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
if (qn == 1)
{
mp_limb_t q0, r0;
- udiv_qrnnd (q0, r0, n2p[1], n2p[0] << GMP_NAIL_BITS, d2p[0] << GMP_NAIL_BITS);
- n2p[0] = r0 >> GMP_NAIL_BITS;
+ mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
+ /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
+ temps here. This doesn't hurt code quality on any machines
+ so we do it unconditionally. */
+ gcc272bug_n1 = n2p[1];
+ gcc272bug_n0 = n2p[0];
+ gcc272bug_d0 = d2p[0];
+ udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0 << GMP_NAIL_BITS,
+ gcc272bug_d0 << GMP_NAIL_BITS);
+ r0 >>= GMP_NAIL_BITS;
+ n2p[0] = r0;
qp[0] = q0;
}
else if (qn == 2)
- mpn_divrem_2 (qp, 0L, n2p, 4L, d2p); /* FIXME: obsolete function */
+ mpn_divrem_2 (qp, 0L, n2p, 4L, d2p);
+ else if (qn < DIV_DC_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, 2 * qn, d2p, qn);
else
- {
- invert_pi1 (dinv, d2p[qn - 1], d2p[qn - 2]);
- if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
- mpn_sbpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, dinv.inv32);
- else if (BELOW_THRESHOLD (qn, MU_DIV_QR_THRESHOLD))
- mpn_dcpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, &dinv);
- else
- {
- mp_size_t itch = mpn_mu_div_qr_itch (2 * qn, qn, 0);
- mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
- mp_ptr r2p = rp;
- if (np == r2p) /* If N and R share space, put ... */
- r2p += nn - qn; /* intermediate remainder at N's upper end. */
- mpn_mu_div_qr (qp, r2p, n2p, 2 * qn, d2p, qn, scratch);
- MPN_COPY (n2p, r2p, qn);
- }
- }
+ mpn_dc_divrem_n (qp, n2p, d2p, qn);
rn = qn;
/* Multiply the first ignored divisor limb by the most significant
@@ -304,7 +316,7 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
dl = dp[in - 2];
#if GMP_NAIL_BITS == 0
- x = (dp[in - 1] << cnt) | ((dl >> 1) >> ((~cnt) % GMP_LIMB_BITS));
+ x = (dp[in - 1] << cnt) | ((dl >> 1) >> ((~cnt) % BITS_PER_MP_LIMB));
#else
x = (dp[in - 1] << cnt) & GMP_NUMB_MASK;
if (cnt != 0)
@@ -354,7 +366,7 @@ mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
}
/* True: partial remainder now is neutral, i.e., it is not shifted up. */
- tp = TMP_ALLOC_LIMBS (dn);
+ tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
if (in < qn)
{
diff --git a/gmp/mpn/generic/toom22_mul.c b/gmp/mpn/generic/toom22_mul.c
index 36ac29b72d..6407bbeb96 100644
--- a/gmp/mpn/generic/toom22_mul.c
+++ b/gmp/mpn/generic/toom22_mul.c
@@ -7,33 +7,22 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2010, 2012 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
@@ -52,7 +41,7 @@ see https://www.gnu.org/licenses/. */
vinf= a1 * b1 # A(inf)*B(inf)
*/
-#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_toom22 1
#else
#define MAYBE_mul_toom22 \
@@ -62,36 +51,18 @@ see https://www.gnu.org/licenses/. */
#define TOOM22_MUL_N_REC(p, a, b, n, ws) \
do { \
if (! MAYBE_mul_toom22 \
- || BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
+ || BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
mpn_mul_basecase (p, a, n, b, n); \
else \
mpn_toom22_mul (p, a, n, b, n, ws); \
} while (0)
-/* Normally, this calls mul_basecase or toom22_mul. But when the fraction
- MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD is large, an initially small
- relative unbalance will become a larger and larger relative unbalance with
- each recursion (the difference s-t will be invariant over recursive calls).
- Therefore, we need to call toom32_mul. FIXME: Suppress depending on
- MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD and on MUL_TOOM22_THRESHOLD. */
-#define TOOM22_MUL_REC(p, a, an, b, bn, ws) \
- do { \
- if (! MAYBE_mul_toom22 \
- || BELOW_THRESHOLD (bn, MUL_TOOM22_THRESHOLD)) \
- mpn_mul_basecase (p, a, an, b, bn); \
- else if (4 * an < 5 * bn) \
- mpn_toom22_mul (p, a, an, b, bn, ws); \
- else \
- mpn_toom32_mul (p, a, an, b, bn, ws); \
- } while (0)
-
void
mpn_toom22_mul (mp_ptr pp,
mp_srcptr ap, mp_size_t an,
mp_srcptr bp, mp_size_t bn,
mp_ptr scratch)
{
- const int __gmpn_cpuvec_initialized = 1;
mp_size_t n, s, t;
int vm1_neg;
mp_limb_t cy, cy2;
@@ -179,8 +150,8 @@ mpn_toom22_mul (mp_ptr pp,
/* vm1, 2n limbs */
TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
- if (s > t) TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out);
- else TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out);
+ /* vinf, s+t limbs */
+ mpn_mul (vinf, a1, s, b1, t);
/* v0, 2n limbs */
TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out);
diff --git a/gmp/mpn/generic/toom2_sqr.c b/gmp/mpn/generic/toom2_sqr.c
index 2f2fdaee6f..445cff8f5d 100644
--- a/gmp/mpn/generic/toom2_sqr.c
+++ b/gmp/mpn/generic/toom2_sqr.c
@@ -6,33 +6,22 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2010, 2012 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
@@ -43,23 +32,25 @@ see https://www.gnu.org/licenses/. */
<-s--><--n-->
____ ______
|_a1_|___a0_|
+ |b1_|___b0_|
+ <-t-><--n-->
- v0 = a0 ^2 # A(0)^2
- vm1 = (a0- a1)^2 # A(-1)^2
- vinf= a1 ^2 # A(inf)^2
+ v0 = a0 * b0 # A(0)*B(0)
+ vm1 = (a0- a1)*(b0- b1) # A(-1)*B(-1)
+ vinf= a1 * b1 # A(inf)*B(inf)
*/
-#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#if TUNE_PROGRAM_BUILD
#define MAYBE_sqr_toom2 1
#else
#define MAYBE_sqr_toom2 \
(SQR_TOOM3_THRESHOLD >= 2 * SQR_TOOM2_THRESHOLD)
#endif
-#define TOOM2_SQR_REC(p, a, n, ws) \
+#define TOOM2_SQR_N_REC(p, a, n, ws) \
do { \
if (! MAYBE_sqr_toom2 \
- || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \
+ || BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \
mpn_sqr_basecase (p, a, n); \
else \
mpn_toom2_sqr (p, a, n, ws); \
@@ -70,7 +61,6 @@ mpn_toom2_sqr (mp_ptr pp,
mp_srcptr ap, mp_size_t an,
mp_ptr scratch)
{
- const int __gmpn_cpuvec_initialized = 1;
mp_size_t n, s;
mp_limb_t cy, cy2;
mp_ptr asm1;
@@ -113,16 +103,15 @@ mpn_toom2_sqr (mp_ptr pp,
#define v0 pp /* 2n */
#define vinf (pp + 2 * n) /* s+s */
#define vm1 scratch /* 2n */
-#define scratch_out scratch + 2 * n
/* vm1, 2n limbs */
- TOOM2_SQR_REC (vm1, asm1, n, scratch_out);
+ TOOM2_SQR_N_REC (vm1, asm1, n, scratch);
/* vinf, s+s limbs */
- TOOM2_SQR_REC (vinf, a1, s, scratch_out);
+ TOOM2_SQR_N_REC (vinf, a1, s, scratch);
/* v0, 2n limbs */
- TOOM2_SQR_REC (v0, ap, n, scratch_out);
+ TOOM2_SQR_N_REC (v0, ap, n, scratch);
/* H(v0) + L(vinf) */
cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);
diff --git a/gmp/mpn/generic/toom32_mul.c b/gmp/mpn/generic/toom32_mul.c
index 0b05669cc4..7bdd688a53 100644
--- a/gmp/mpn/generic/toom32_mul.c
+++ b/gmp/mpn/generic/toom32_mul.c
@@ -2,7 +2,6 @@
times as large as bn. Or more accurately, bn < an < 3bn.
Contributed to the GNU project by Torbjorn Granlund.
- Improvements by Marco Bodrato and Niels Möller.
The idea of applying toom to unbalanced multiplication is due to Marco
Bodrato and Alberto Zanoni.
@@ -11,34 +10,32 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2010 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-or both in parallel, as here.
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+/*
+ Things to work on:
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+ 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
+ avoided by instead reusing the pp area and the scratch allocation.
+ 2. Apply optimizations also to mul_toom42.c.
+*/
#include "gmp.h"
#include "gmp-impl.h"
@@ -57,9 +54,20 @@ see https://www.gnu.org/licenses/. */
vinf= a2 * b1 # A(inf)*B(inf)
*/
-#define TOOM32_MUL_N_REC(p, a, b, n, ws) \
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_mul_toom22 1
+#else
+#define MAYBE_mul_toom22 \
+ (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD)
+#endif
+
+#define TOOM22_MUL_N_REC(p, a, b, n, ws) \
do { \
- mpn_mul_n (p, a, b, n); \
+ if (! MAYBE_mul_toom22 \
+ || BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else \
+ mpn_toom22_mul (p, a, n, b, n, ws); \
} while (0)
void
@@ -70,9 +78,15 @@ mpn_toom32_mul (mp_ptr pp,
{
mp_size_t n, s, t;
int vm1_neg;
+#if HAVE_NATIVE_mpn_add_nc
mp_limb_t cy;
- mp_limb_signed_t hi;
- mp_limb_t ap1_hi, bp1_hi;
+#else
+ mp_limb_t cy, cy2;
+#endif
+ mp_ptr a0_a2;
+ mp_ptr as1, asm1;
+ mp_ptr bs1, bsm1;
+ TMP_DECL;
#define a0 ap
#define a1 (ap + n)
@@ -80,9 +94,6 @@ mpn_toom32_mul (mp_ptr pp,
#define b0 bp
#define b1 (bp + n)
- /* Required, to ensure that s + t >= n. */
- ASSERT (bn + 2 <= an && an + 6 <= 3*bn);
-
n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
s = an - 2 * n;
@@ -90,234 +101,191 @@ mpn_toom32_mul (mp_ptr pp,
ASSERT (0 < s && s <= n);
ASSERT (0 < t && t <= n);
- ASSERT (s + t >= n);
- /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
-#define ap1 (pp) /* n, most significant limb in ap1_hi */
-#define bp1 (pp + n) /* n, most significant bit in bp1_hi */
-#define am1 (pp + 2*n) /* n, most significant bit in hi */
-#define bm1 (pp + 3*n) /* n */
-#define v1 (scratch) /* 2n + 1 */
-#define vm1 (pp) /* 2n + 1 */
-#define scratch_out (scratch + 2*n + 1) /* Currently unused. */
+ TMP_MARK;
+
+ as1 = TMP_SALLOC_LIMBS (n + 1);
+ asm1 = TMP_SALLOC_LIMBS (n + 1);
- /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */
+ bs1 = TMP_SALLOC_LIMBS (n + 1);
+ bsm1 = TMP_SALLOC_LIMBS (n);
- /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
+ a0_a2 = pp;
- /* Compute ap1 = a0 + a1 + a3, am1 = a0 - a1 + a3 */
- ap1_hi = mpn_add (ap1, a0, n, a2, s);
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
+ /* Compute as1 and asm1. */
+ a0_a2[n] = mpn_add (a0_a2, a0, n, a2, s);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (a0_a2[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
{
- ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
- hi = 0;
+ cy = mpn_addsub_n (as1, asm1, a1, a0_a2, n);
+ as1[n] = cy >> 1;
+ asm1[n] = 0;
vm1_neg = 1;
}
else
{
- cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
- hi = ap1_hi - (cy & 1);
- ap1_hi += (cy >> 1);
+ cy = mpn_addsub_n (as1, asm1, a0_a2, a1, n);
+ as1[n] = a0_a2[n] + (cy >> 1);
+ asm1[n] = a0_a2[n] - (cy & 1);
vm1_neg = 0;
}
#else
- if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
+ as1[n] = a0_a2[n] + mpn_add_n (as1, a0_a2, a1, n);
+ if (a0_a2[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
{
- ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
- hi = 0;
+ mpn_sub_n (asm1, a1, a0_a2, n);
+ asm1[n] = 0;
vm1_neg = 1;
}
else
{
- hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
+ cy = mpn_sub_n (asm1, a0_a2, a1, n);
+ asm1[n] = a0_a2[n] - cy;
vm1_neg = 0;
}
- ap1_hi += mpn_add_n (ap1, ap1, a1, n);
#endif
- /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
+ /* Compute bs1 and bsm1. */
if (t == n)
{
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (mpn_cmp (b0, b1, n) < 0)
{
- cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
+ cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
vm1_neg ^= 1;
}
else
{
- cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
+ cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
}
- bp1_hi = cy >> 1;
+ bs1[n] = cy >> 1;
#else
- bp1_hi = mpn_add_n (bp1, b0, b1, n);
+ bs1[n] = mpn_add_n (bs1, b0, b1, n);
if (mpn_cmp (b0, b1, n) < 0)
{
- ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
+ mpn_sub_n (bsm1, b1, b0, n);
vm1_neg ^= 1;
}
else
{
- ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
+ mpn_sub_n (bsm1, b0, b1, n);
}
#endif
}
else
{
- /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
- bp1_hi = mpn_add (bp1, b0, n, b1, t);
+ bs1[n] = mpn_add (bs1, b0, n, b1, t);
if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
{
- ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
- MPN_ZERO (bm1 + t, n - t);
+ mpn_sub_n (bsm1, b1, b0, t);
+ MPN_ZERO (bsm1 + t, n - t);
vm1_neg ^= 1;
}
else
{
- ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
+ mpn_sub (bsm1, b0, n, b1, t);
}
}
- TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
- if (ap1_hi == 1)
+ ASSERT (as1[n] <= 2);
+ ASSERT (bs1[n] <= 1);
+ ASSERT (asm1[n] <= 1);
+/*ASSERT (bsm1[n] == 0); */
+
+#define v0 pp /* 2n */
+#define v1 (scratch) /* 2n+1 */
+#define vinf (pp + 3 * n) /* s+t */
+#define vm1 (scratch + 2 * n + 1) /* 2n+1 */
+#define scratch_out scratch + 4 * n + 2
+
+ /* vm1, 2n+1 limbs */
+ TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
+ cy = 0;
+ if (asm1[n] != 0)
+ cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
+ vm1[2 * n] = cy;
+
+ /* vinf, s+t limbs */
+ if (s > t) mpn_mul (vinf, a2, s, b1, t);
+ else mpn_mul (vinf, b1, t, a2, s);
+
+ /* v1, 2n+1 limbs */
+ TOOM22_MUL_N_REC (v1, as1, bs1, n, scratch_out);
+ if (as1[n] == 1)
{
- cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
+ cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
}
- else if (ap1_hi == 2)
+ else if (as1[n] == 2)
{
#if HAVE_NATIVE_mpn_addlsh1_n
- cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
+ cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
- cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
+ cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
}
else
cy = 0;
- if (bp1_hi != 0)
- cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
+ if (bs1[n] != 0)
+ cy += mpn_add_n (v1 + n, v1 + n, as1, n);
v1[2 * n] = cy;
- TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
- if (hi)
- hi = mpn_add_n (vm1+n, vm1+n, bm1, n);
+ mpn_mul_n (v0, ap, bp, n); /* v0, 2n limbs */
- vm1[2*n] = hi;
+ /* Interpolate */
- /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
if (vm1_neg)
{
-#if HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
+#if HAVE_NATIVE_mpn_rsh1add_n
+ mpn_rsh1add_n (vm1, v1, vm1, 2 * n + 1);
#else
- mpn_sub_n (v1, v1, vm1, 2*n+1);
- ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
+ mpn_add_n (vm1, v1, vm1, 2 * n + 1);
+ mpn_rshift (vm1, vm1, 2 * n + 1, 1);
#endif
}
else
{
-#if HAVE_NATIVE_mpn_rsh1add_n
- mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
+#if HAVE_NATIVE_mpn_rsh1sub_n
+ mpn_rsh1sub_n (vm1, v1, vm1, 2 * n + 1);
#else
- mpn_add_n (v1, v1, vm1, 2*n+1);
- ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
+ mpn_sub_n (vm1, v1, vm1, 2 * n + 1);
+ mpn_rshift (vm1, vm1, 2 * n + 1, 1);
#endif
}
- /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence
-
- y = x1 + x3 + (x0 + x2) * B
- = (x0 + x2) * B + (x0 + x2) - vm1.
-
- y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
- follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
- (already in place, except for carry propagation).
+ mpn_sub_n (v1, v1, vm1, 2 * n + 1);
+ v1[2 * n] -= mpn_sub_n (v1, v1, v0, 2 * n);
- We thus add
+ /*
+ pp[] prior to operations:
+ |_H vinf|_L vinf|_______|_______|_______|
- B^3 B^2 B 1
- | | | |
- +-----+----+
- + | x0 + x2 |
- +----+-----+----+
- + | x0 + x2 |
- +----------+
- - | vm1 |
- --+----++----+----+-
- | y2 | y1 | y0 |
- +-----+----+----+
-
- Since we store y0 at the same location as the low half of x0 + x2, we
- need to do the middle sum first. */
-
- hi = vm1[2*n];
- cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
- MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);
-
- /* FIXME: Can we get rid of this second vm1_neg conditional by
- swapping the location of +1 and -1 values? */
- if (vm1_neg)
- {
- cy = mpn_add_n (v1, v1, vm1, n);
- hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
- MPN_INCR_U (v1 + n, n+1, hi);
- }
- else
- {
- cy = mpn_sub_n (v1, v1, vm1, n);
- hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
- MPN_DECR_U (v1 + n, n+1, hi);
- }
-
- TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
- /* vinf, s+t limbs. Use mpn_mul for now, to handle unbalanced operands */
- if (s > t) mpn_mul (pp+3*n, a2, s, b1, t);
- else mpn_mul (pp+3*n, b1, t, a2, s);
-
- /* Remaining interpolation.
-
- y * B + x0 + x3 B^3 - x0 B^2 - x3 B
- = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
- = y0 B + y1 B^2 + y3 B^3 + Lx0 + H x0 B
- + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
- = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
- + (y2 - (H x0 - L x3)) B^3 + H x3 B^4
-
- B^4 B^3 B^2 B 1
- | | | | | |
- +-------+ +---------+---------+
- | Hx3 | | Hx0-Lx3 | Lx0 |
- +------+----------+---------+---------+---------+
- | y2 | y1 | y0 |
- ++---------+---------+---------+
- -| Hx0-Lx3 | - Lx0 |
- +---------+---------+
- | - Hx3 |
- +--------+
-
- We must take into account the carry from Hx0 - Lx3.
+ summation scheme for remaining operations:
+ |_______|_______|_______|_______|_______|
+ |_Hvinf_|_Lvinf_| |_H v0__|_L v0__|
+ | H vm1 | L vm1 |
+ |-H vinf|-L vinf|
+ | H v1 | L v1 |
*/
- cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
- hi = scratch[2*n] + cy;
-
- cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
- hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);
-
- hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);
-
- /* FIXME: Is support for s + t == n needed? */
- if (LIKELY (s + t > n))
- {
- hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);
+ mpn_sub (vm1, vm1, 2 * n + 1, vinf, s + t);
+#if HAVE_NATIVE_mpn_add_nc
+ cy = mpn_add_n (pp + n, pp + n, vm1, n);
+ cy = mpn_add_nc (pp + 2 * n, v1, vm1 + n, n, cy);
+ cy = mpn_add_nc (pp + 3 * n, pp + 3 * n, v1 + n, n, cy);
+ mpn_incr_u (pp + 3 * n, vm1[2 * n]);
+ if (LIKELY (n != s + t)) /* FIXME: Limit operand range to avoid condition */
+ mpn_incr_u (pp + 4 * n, cy + v1[2 * n]);
+#else
+ cy2 = mpn_add_n (pp + n, pp + n, vm1, n);
+ cy = mpn_add_n (pp + 2 * n, v1, vm1 + n, n);
+ mpn_incr_u (pp + 2 * n, cy2);
+ mpn_incr_u (pp + 3 * n, cy + vm1[2 * n]);
+ cy = mpn_add_n (pp + 3 * n, pp + 3 * n, v1 + n, n);
+ if (LIKELY (n != s + t)) /* FIXME: Limit operand range to avoid condition */
+ mpn_incr_u (pp + 4 * n, cy + v1[2 * n]);
+#endif
- if (hi < 0)
- MPN_DECR_U (pp + 4*n, s+t-n, -hi);
- else
- MPN_INCR_U (pp + 4*n, s+t-n, hi);
- }
- else
- ASSERT (hi == 0);
+ TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom33_mul.c b/gmp/mpn/generic/toom33_mul.c
index 655355c39a..5fa2813c31 100644
--- a/gmp/mpn/generic/toom33_mul.c
+++ b/gmp/mpn/generic/toom33_mul.c
@@ -1,52 +1,48 @@
-/* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in
+/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
size. Or more accurately, bn <= an < (3/2)bn.
Contributed to the GNU project by Torbjorn Granlund.
- Additional improvements by Marco Bodrato.
THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2008, 2010, 2012 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+/*
+ Things to work on:
+
+ 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
+ avoided by instead reusing the pp area and the scratch area.
+ 2. Use new toom functions for the recursive calls.
+*/
#include "gmp.h"
#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +2, +inf
- <-s--><--n--><--n-->
- ____ ______ ______
- |_a2_|___a1_|___a0_|
- |b2_|___b1_|___b0_|
- <-t-><--n--><--n-->
+ <-s-><--n--><--n--><--n-->
+ ___ ______ ______ ______
+ |a3_|___a2_|___a1_|___a0_|
+ |_b1_|___b0_|
+ <-t--><--n-->
v0 = a0 * b0 # A(0)*B(0)
v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2
@@ -55,33 +51,26 @@ see https://www.gnu.org/licenses/. */
vinf= a2 * b2 # A(inf)*B(inf)
*/
-#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33 1
#else
#define MAYBE_mul_basecase \
- (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
+ (MUL_TOOM33_THRESHOLD < 3 * MUL_KARATSUBA_THRESHOLD)
#define MAYBE_mul_toom33 \
(MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif
-/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
- multiplication at the infinity point. We may have
- MAYBE_mul_basecase == 0, and still get s just below
- MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
- s == 1 and mpn_toom22_mul will crash.
-*/
-
#define TOOM33_MUL_N_REC(p, a, b, n, ws) \
do { \
if (MAYBE_mul_basecase \
- && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
+ && BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
mpn_mul_basecase (p, a, n, b, n); \
else if (! MAYBE_mul_toom33 \
|| BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
- mpn_toom22_mul (p, a, n, b, n, ws); \
+ mpn_kara_mul_n (p, a, b, n, ws); \
else \
- mpn_toom33_mul (p, a, n, b, n, ws); \
+ mpn_toom3_mul_n (p, a, b, n, ws); \
} while (0)
void
@@ -90,13 +79,13 @@ mpn_toom33_mul (mp_ptr pp,
mp_srcptr bp, mp_size_t bn,
mp_ptr scratch)
{
- const int __gmpn_cpuvec_initialized = 1;
mp_size_t n, s, t;
int vm1_neg;
mp_limb_t cy, vinf0;
mp_ptr gp;
mp_ptr as1, asm1, as2;
mp_ptr bs1, bsm1, bs2;
+ TMP_DECL;
#define a0 ap
#define a1 (ap + n)
@@ -115,34 +104,35 @@ mpn_toom33_mul (mp_ptr pp,
ASSERT (0 < s && s <= n);
ASSERT (0 < t && t <= n);
- as1 = scratch + 4 * n + 4;
- asm1 = scratch + 2 * n + 2;
- as2 = pp + n + 1;
+ TMP_MARK;
+
+ as1 = TMP_SALLOC_LIMBS (n + 1);
+ asm1 = TMP_SALLOC_LIMBS (n + 1);
+ as2 = TMP_SALLOC_LIMBS (n + 1);
- bs1 = pp;
- bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
- bs2 = pp + 2 * n + 2;
+ bs1 = TMP_SALLOC_LIMBS (n + 1);
+ bsm1 = TMP_SALLOC_LIMBS (n + 1);
+ bs2 = TMP_SALLOC_LIMBS (n + 1);
- gp = scratch;
+ gp = pp;
vm1_neg = 0;
/* Compute as1 and asm1. */
cy = mpn_add (gp, a0, n, a2, s);
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
{
- cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
- as1[n] = cy >> 1;
+ cy = mpn_addsub_n (as1, asm1, a1, gp, n);
+ as1[n] = 0;
asm1[n] = 0;
vm1_neg = 1;
}
else
{
- mp_limb_t cy2;
- cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
+ cy2 = mpn_addsub_n (as1, asm1, gp, a1, n);
as1[n] = cy + (cy2 >> 1);
- asm1[n] = cy - (cy2 & 1);
+ asm1[n] = cy - (cy & 1);
}
#else
as1[n] = cy + mpn_add_n (as1, gp, a1, n);
@@ -160,45 +150,36 @@ mpn_toom33_mul (mp_ptr pp,
#endif
/* Compute as2. */
-#if HAVE_NATIVE_mpn_rsblsh1_n
- cy = mpn_add_n (as2, a2, as1, s);
- if (s != n)
- cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
- cy += as1[n];
- cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
-#else
#if HAVE_NATIVE_mpn_addlsh1_n
cy = mpn_addlsh1_n (as2, a1, a2, s);
if (s != n)
cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
- cy = mpn_add_n (as2, a2, as1, s);
+ cy = mpn_lshift (as2, a2, s, 1);
+ cy += mpn_add_n (as2, a1, as2, s);
if (s != n)
- cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
- cy += as1[n];
+ cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
- cy -= mpn_sub_n (as2, as2, a0, n);
-#endif
+ cy += mpn_add_n (as2, a0, as2, n);
#endif
as2[n] = cy;
/* Compute bs1 and bsm1. */
cy = mpn_add (gp, b0, n, b2, t);
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
{
- cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
- bs1[n] = cy >> 1;
+ cy = mpn_addsub_n (bs1, bsm1, b1, gp, n);
+ bs1[n] = 0;
bsm1[n] = 0;
vm1_neg ^= 1;
}
else
{
- mp_limb_t cy2;
- cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
+ cy2 = mpn_addsub_n (bs1, bsm1, gp, b1, n);
bs1[n] = cy + (cy2 >> 1);
- bsm1[n] = cy - (cy2 & 1);
+ bsm1[n] = cy - (cy & 1);
}
#else
bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
@@ -216,26 +197,18 @@ mpn_toom33_mul (mp_ptr pp,
#endif
/* Compute bs2. */
-#if HAVE_NATIVE_mpn_rsblsh1_n
- cy = mpn_add_n (bs2, b2, bs1, t);
- if (t != n)
- cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
- cy += bs1[n];
- cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
-#else
#if HAVE_NATIVE_mpn_addlsh1_n
cy = mpn_addlsh1_n (bs2, b1, b2, t);
if (t != n)
cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
- cy = mpn_add_n (bs2, bs1, b2, t);
+ cy = mpn_lshift (bs2, b2, t, 1);
+ cy += mpn_add_n (bs2, b1, bs2, t);
if (t != n)
- cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
- cy += bs1[n];
+ cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
- cy -= mpn_sub_n (bs2, bs2, b0, n);
-#endif
+ cy += mpn_add_n (bs2, b0, bs2, n);
#endif
bs2[n] = cy;
@@ -251,7 +224,7 @@ mpn_toom33_mul (mp_ptr pp,
#define vinf (pp + 4 * n) /* s+t */
#define vm1 scratch /* 2n+1 */
#define v2 (scratch + 2 * n + 1) /* 2n+2 */
-#define scratch_out (scratch + 5 * n + 5)
+#define scratch_out (scratch + 4 * n + 4)
/* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
@@ -312,5 +285,7 @@ mpn_toom33_mul (mp_ptr pp,
TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
- mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
+ mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, 1^vm1_neg, vinf0, scratch_out);
+
+ TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom3_sqr.c b/gmp/mpn/generic/toom3_sqr.c
index 6117c67ca6..0c8a4ff74d 100644
--- a/gmp/mpn/generic/toom3_sqr.c
+++ b/gmp/mpn/generic/toom3_sqr.c
@@ -1,77 +1,75 @@
/* mpn_toom3_sqr -- Square {ap,an}.
Contributed to the GNU project by Torbjorn Granlund.
- Additional improvements by Marco Bodrato.
THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2010, 2012 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+/*
+ Things to work on:
+
+ 1. Trim allocation. The allocations for as1 and asm1 could be
+ avoided by instead reusing the pp area and the scratch area.
+ 2. Use new toom functions for the recursive calls.
+*/
+
#include "gmp.h"
#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +2, +inf
- <-s--><--n--><--n-->
- ____ ______ ______
- |_a2_|___a1_|___a0_|
-
- v0 = a0 ^2 # A(0)^2
- v1 = (a0+ a1+ a2)^2 # A(1)^2 ah <= 2
- vm1 = (a0- a1+ a2)^2 # A(-1)^2 |ah| <= 1
- v2 = (a0+2a1+4a2)^2 # A(2)^2 ah <= 6
- vinf= a2 ^2 # A(inf)^2
+ <-s--><--n--><--n-->
+ ____ ______ ______
+ |_a2_|___a1_|___a0_|
+
+ Squaring: the single operand {ap,an} is split into a0, a1 and a2.
+
+ v0 = a0 ^2 # A(0)^2
+ v1 = (a0+ a1+ a2)^2 # A(1)^2 ah <= 2
+ vm1 = (a0- a1+ a2)^2 # A(-1)^2 |ah| <= 1
+ v2 = (a0+2a1+4a2)^2 # A(2)^2 ah <= 6
+ vinf= a2 ^2 # A(inf)^2
*/
-#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#if TUNE_PROGRAM_BUILD
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3 1
#else
#define MAYBE_sqr_basecase \
- (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
+ (SQR_TOOM3_THRESHOLD < 3 * SQR_KARATSUBA_THRESHOLD)
#define MAYBE_sqr_toom3 \
(SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif
-#define TOOM3_SQR_REC(p, a, n, ws) \
+#define TOOM3_SQR_N_REC(p, a, n, ws) \
do { \
if (MAYBE_sqr_basecase \
- && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \
+ && BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \
mpn_sqr_basecase (p, a, n); \
else if (! MAYBE_sqr_toom3 \
|| BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
- mpn_toom2_sqr (p, a, n, ws); \
+ mpn_kara_sqr_n (p, a, n, ws); \
else \
- mpn_toom3_sqr (p, a, n, ws); \
+ mpn_toom3_sqr_n (p, a, n, ws); \
} while (0)
void
@@ -79,11 +77,11 @@ mpn_toom3_sqr (mp_ptr pp,
mp_srcptr ap, mp_size_t an,
mp_ptr scratch)
{
- const int __gmpn_cpuvec_initialized = 1;
mp_size_t n, s;
mp_limb_t cy, vinf0;
mp_ptr gp;
mp_ptr as1, asm1, as2;
+ TMP_DECL;
#define a0 ap
#define a1 (ap + n)
@@ -95,27 +93,28 @@ mpn_toom3_sqr (mp_ptr pp,
ASSERT (0 < s && s <= n);
- as1 = scratch + 4 * n + 4;
- asm1 = scratch + 2 * n + 2;
- as2 = pp + n + 1;
+ TMP_MARK;
+
+ as1 = TMP_SALLOC_LIMBS (n + 1);
+ asm1 = TMP_SALLOC_LIMBS (n + 1);
+ as2 = TMP_SALLOC_LIMBS (n + 1);
- gp = scratch;
+ gp = pp;
/* Compute as1 and asm1. */
cy = mpn_add (gp, a0, n, a2, s);
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
{
- cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
- as1[n] = cy >> 1;
+ cy = mpn_addsub_n (as1, asm1, a1, gp, n);
+ as1[n] = 0;
asm1[n] = 0;
}
else
{
- mp_limb_t cy2;
- cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
+ cy2 = mpn_addsub_n (as1, asm1, gp, a1, n);
as1[n] = cy + (cy2 >> 1);
- asm1[n] = cy - (cy2 & 1);
+ asm1[n] = cy - (cy & 1);
}
#else
as1[n] = cy + mpn_add_n (as1, gp, a1, n);
@@ -132,26 +131,18 @@ mpn_toom3_sqr (mp_ptr pp,
#endif
/* Compute as2. */
-#if HAVE_NATIVE_mpn_rsblsh1_n
- cy = mpn_add_n (as2, a2, as1, s);
- if (s != n)
- cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
- cy += as1[n];
- cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
-#else
#if HAVE_NATIVE_mpn_addlsh1_n
cy = mpn_addlsh1_n (as2, a1, a2, s);
if (s != n)
cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
- cy = mpn_add_n (as2, a2, as1, s);
+ cy = mpn_lshift (as2, a2, s, 1);
+ cy += mpn_add_n (as2, a1, as2, s);
if (s != n)
- cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
- cy += as1[n];
+ cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
- cy -= mpn_sub_n (as2, as2, a0, n);
-#endif
+ cy += mpn_add_n (as2, a0, as2, n);
#endif
as2[n] = cy;
@@ -163,11 +154,11 @@ mpn_toom3_sqr (mp_ptr pp,
#define vinf (pp + 4 * n) /* s+s */
#define vm1 scratch /* 2n+1 */
#define v2 (scratch + 2 * n + 1) /* 2n+2 */
-#define scratch_out (scratch + 5 * n + 5)
+#define scratch_out (scratch + 4 * n + 4)
/* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
- TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
+ TOOM3_SQR_N_REC (vm1, asm1, n, scratch_out);
cy = 0;
if (asm1[n] != 0)
cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
@@ -175,18 +166,18 @@ mpn_toom3_sqr (mp_ptr pp,
cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
vm1[2 * n] = cy;
#else
- TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
+ TOOM3_SQR_N_REC (vm1, asm1, n + 1, scratch_out);
#endif
- TOOM3_SQR_REC (v2, as2, n + 1, scratch_out); /* v2, 2n+1 limbs */
+ TOOM3_SQR_N_REC (v2, as2, n + 1, scratch_out); /* v2, 2n+1 limbs */
- TOOM3_SQR_REC (vinf, a2, s, scratch_out); /* vinf, s+s limbs */
+ TOOM3_SQR_N_REC (vinf, a2, s, scratch_out); /* vinf, s+s limbs */
vinf0 = vinf[0]; /* v1 overlaps with this */
#ifdef SMALLER_RECURSION
/* v1, 2n+1 limbs */
- TOOM3_SQR_REC (v1, as1, n, scratch_out);
+ TOOM3_SQR_N_REC (v1, as1, n, scratch_out);
if (as1[n] == 1)
{
cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
@@ -216,11 +207,13 @@ mpn_toom3_sqr (mp_ptr pp,
v1[2 * n] = cy;
#else
cy = vinf[1];
- TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
+ TOOM3_SQR_N_REC (v1, as1, n + 1, scratch_out);
vinf[1] = cy;
#endif
- TOOM3_SQR_REC (v0, ap, n, scratch_out); /* v0, 2n limbs */
+ TOOM3_SQR_N_REC (v0, ap, n, scratch_out); /* v0, 2n limbs */
+
+ mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 1, vinf0, scratch_out);
- mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
+ TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom42_mul.c b/gmp/mpn/generic/toom42_mul.c
index 9b1e7d491b..981b45df83 100644
--- a/gmp/mpn/generic/toom42_mul.c
+++ b/gmp/mpn/generic/toom42_mul.c
@@ -11,34 +11,32 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2008, 2012 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-or both in parallel, as here.
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+/*
+ Things to work on:
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+ 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
+ avoided by instead reusing the pp area and the scratch allocation.
+ 2. Apply optimizations also to mul_toom32.c.
+*/
#include "gmp.h"
#include "gmp-impl.h"
@@ -58,9 +56,20 @@ see https://www.gnu.org/licenses/. */
vinf= a3 * b1 # A(inf)*B(inf)
*/
-#define TOOM42_MUL_N_REC(p, a, b, n, ws) \
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_mul_toom22 1
+#else
+#define MAYBE_mul_toom22 \
+ (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD)
+#endif
+
+#define TOOM22_MUL_N_REC(p, a, b, n, ws) \
do { \
- mpn_mul_n (p, a, b, n); \
+ if (! MAYBE_mul_toom22 \
+ || BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else \
+ mpn_toom22_mul (p, a, n, b, n, ws); \
} while (0)
void
@@ -72,7 +81,7 @@ mpn_toom42_mul (mp_ptr pp,
mp_size_t n, s, t;
int vm1_neg;
mp_limb_t cy, vinf0;
- mp_ptr a0_a2;
+ mp_ptr a0_a2, a1_a3;
mp_ptr as1, asm1, as2;
mp_ptr bs1, bsm1, bs2;
TMP_DECL;
@@ -103,9 +112,35 @@ mpn_toom42_mul (mp_ptr pp,
bs2 = TMP_SALLOC_LIMBS (n + 1);
a0_a2 = pp;
+ a1_a3 = pp + n + 1;
/* Compute as1 and asm1. */
- vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1;
+ a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n);
+ a1_a3[n] = mpn_add (a1_a3, a1, n, a3, s);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
+ {
+ mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1);
+ vm1_neg = 1;
+ }
+ else
+ {
+ mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1);
+ vm1_neg = 0;
+ }
+#else
+ mpn_add_n (as1, a0_a2, a1_a3, n + 1);
+ if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
+ {
+ mpn_sub_n (asm1, a1_a3, a0_a2, n + 1);
+ vm1_neg = 1;
+ }
+ else
+ {
+ mpn_sub_n (asm1, a0_a2, a1_a3, n + 1);
+ vm1_neg = 0;
+ }
+#endif
/* Compute as2. */
#if HAVE_NATIVE_mpn_addlsh1_n
@@ -129,15 +164,15 @@ mpn_toom42_mul (mp_ptr pp,
/* Compute bs1 and bsm1. */
if (t == n)
{
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (mpn_cmp (b0, b1, n) < 0)
{
- cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
+ cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
vm1_neg ^= 1;
}
else
{
- cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
+ cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
}
bs1[n] = cy >> 1;
#else
@@ -185,16 +220,16 @@ mpn_toom42_mul (mp_ptr pp,
#define vinf (pp + 4 * n) /* s+t */
#define vm1 scratch /* 2n+1 */
#define v2 (scratch + 2 * n + 1) /* 2n+2 */
-#define scratch_out scratch + 4 * n + 4 /* Currently unused. */
+#define scratch_out scratch + 4 * n + 4
/* vm1, 2n+1 limbs */
- TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
+ TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
cy = 0;
if (asm1[n] != 0)
cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
vm1[2 * n] = cy;
- TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
+ TOOM22_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
/* vinf, s+t limbs */
if (s > t) mpn_mul (vinf, a3, s, b1, t);
@@ -203,7 +238,7 @@ mpn_toom42_mul (mp_ptr pp,
vinf0 = vinf[0]; /* v1 overlaps with this */
/* v1, 2n+1 limbs */
- TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out);
+ TOOM22_MUL_N_REC (v1, as1, bs1, n, scratch_out);
if (as1[n] == 1)
{
cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
@@ -226,9 +261,9 @@ mpn_toom42_mul (mp_ptr pp,
cy += mpn_add_n (v1 + n, v1 + n, as1, n);
v1[2 * n] = cy;
- TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
+ TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
- mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
+ mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, 1^vm1_neg, vinf0, scratch + 4 * n + 4);
TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom42_mulmid.c b/gmp/mpn/generic/toom42_mulmid.c
deleted file mode 100644
index 0251a6d7ed..0000000000
--- a/gmp/mpn/generic/toom42_mulmid.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/* mpn_toom42_mulmid -- toom42 middle product
-
- Contributed by David Harvey.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-
-/*
- Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
-
- Neither ap nor bp may overlap rp.
-
- Must have n >= 4.
-
- Amount of scratch space required is given by mpn_toom42_mulmid_itch().
-
- FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
- requirements should be clarified.
-*/
-void
-mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
- mp_ptr scratch)
-{
- mp_limb_t cy, e[12], zh, zl;
- mp_size_t m;
- int neg;
-
- ASSERT (n >= 4);
- ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
- ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
-
- ap += n & 1; /* handle odd row and diagonal later */
- m = n / 2;
-
- /* (e0h:e0l) etc are correction terms, in 2's complement */
-#define e0l (e[0])
-#define e0h (e[1])
-#define e1l (e[2])
-#define e1h (e[3])
-#define e2l (e[4])
-#define e2h (e[5])
-#define e3l (e[6])
-#define e3h (e[7])
-#define e4l (e[8])
-#define e4h (e[9])
-#define e5l (e[10])
-#define e5h (e[11])
-
-#define s (scratch + 2)
-#define t (rp + m + 2)
-#define p0 rp
-#define p1 scratch
-#define p2 (rp + m)
-#define next_scratch (scratch + 3*m + 1)
-
- /*
- rp scratch
- |---------|-----------| |---------|---------|----------|
- 0 m 2m+2 0 m 2m 3m+1
- <----p2----> <-------------s------------->
- <----p0----><---t----> <----p1---->
- */
-
- /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
- cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
- cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
- bp + m, bp, m, cy);
- mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
-
- /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
- if (mpn_cmp (bp + m, bp, m) < 0)
- {
- ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
- ap + m - 1, ap + 2*m - 1, m, 0));
- neg = 1;
- }
- else
- {
- ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
- ap + m - 1, ap + 2*m - 1, m, 0));
- neg = 0;
- }
-
- /* recursive middle products. The picture is:
-
- b[2m-1] A A A B B B - - - - -
- ... - A A A B B B - - - -
- b[m] - - A A A B B B - - -
- b[m-1] - - - C C C D D D - -
- ... - - - - C C C D D D -
- b[0] - - - - - C C C D D D
- a[0] ... a[m] ... a[2m] ... a[4m-2]
- */
-
- if (m < MULMID_TOOM42_THRESHOLD)
- {
- /* A + B */
- mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
- /* accumulate high limbs of p0 into e1 */
- ADDC_LIMB (cy, e1l, e1l, p0[m]);
- e1h += p0[m + 1] + cy;
- /* (-1)^neg * (B - C) (overwrites first m limbs of s) */
- mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
- /* C + D (overwrites t) */
- mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
- }
- else
- {
- /* as above, but use toom42 instead */
- mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
- ADDC_LIMB (cy, e1l, e1l, p0[m]);
- e1h += p0[m + 1] + cy;
- mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
- mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
- }
-
- /* apply error terms */
-
- /* -e0 at rp[0] */
- SUBC_LIMB (cy, rp[0], rp[0], e0l);
- SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
- if (UNLIKELY (cy))
- {
- cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
- SUBC_LIMB (cy, e1l, e1l, cy);
- e1h -= cy;
- }
-
- /* z = e1 - e2 + high(p0) */
- SUBC_LIMB (cy, zl, e1l, e2l);
- zh = e1h - e2h - cy;
-
- /* z at rp[m] */
- ADDC_LIMB (cy, rp[m], rp[m], zl);
- zh = (zh + cy) & GMP_NUMB_MASK;
- ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
- cy -= (zh >> (GMP_NUMB_BITS - 1));
- if (UNLIKELY (cy))
- {
- if (cy == 1)
- mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
- else /* cy == -1 */
- mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
- }
-
- /* e3 at rp[2*m] */
- ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
- rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
-
- /* e4 at p1[0] */
- ADDC_LIMB (cy, p1[0], p1[0], e4l);
- ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
- if (UNLIKELY (cy))
- mpn_add_1 (p1 + 2, p1 + 2, m, 1);
-
- /* -e5 at p1[m] */
- SUBC_LIMB (cy, p1[m], p1[m], e5l);
- p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
-
- /* adjustment if p1 ends up negative */
- cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
-
- /* add (-1)^neg * (p1 - B^m * p1) to output */
- if (neg)
- {
- mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
- mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */
- mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */
- }
- else
- {
- mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
- mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */
- mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */
- }
-
- /* odd row and diagonal */
- if (n & 1)
- {
- /*
- Products marked E are already done. We need to do products marked O.
-
- OOOOO----
- -EEEEO---
- --EEEEO--
- ---EEEEO-
- ----EEEEO
- */
-
- /* first row of O's */
- cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
- ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
-
- /* O's on diagonal */
- /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
- that can handle the sum below. Currently we're relying on
- mulmid_basecase being pretty fast for a diagonal sum like this,
- which is true at least for the K8 asm version, but surely false
- for the generic version. */
- mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
- mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
- }
-}
diff --git a/gmp/mpn/generic/toom43_mul.c b/gmp/mpn/generic/toom43_mul.c
deleted file mode 100644
index 59d45576b8..0000000000
--- a/gmp/mpn/generic/toom43_mul.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
- times as large as bn. Or more accurately, bn < an < 2 bn.
-
- Contributed to the GNU project by Marco Bodrato.
-
- The idea of applying toom to unbalanced multiplication is due to Marco
- Bodrato and Alberto Zanoni.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Evaluate in: -2, -1, 0, +1, +2, +inf
-
- <-s-><--n--><--n--><--n-->
- ___ ______ ______ ______
- |a3_|___a2_|___a1_|___a0_|
- |_b2_|___b1_|___b0_|
- <-t--><--n--><--n-->
-
- v0 = a0 * b0 # A(0)*B(0)
- v1 = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) # A(1)*B(1) ah <= 3 bh <= 2
- vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 |bh|<= 1
- v2 = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) # A(2)*B(2) ah <= 14 bh <= 6
- vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) # A(-2)*B(-2) |ah| <= 9 |bh|<= 4
- vinf= a3 * b2 # A(inf)*B(inf)
-*/
-
-void
-mpn_toom43_mul (mp_ptr pp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
-{
- mp_size_t n, s, t;
- enum toom6_flags flags;
- mp_limb_t cy;
-
-#define a0 ap
-#define a1 (ap + n)
-#define a2 (ap + 2 * n)
-#define a3 (ap + 3 * n)
-#define b0 bp
-#define b1 (bp + n)
-#define b2 (bp + 2 * n)
-
- n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);
-
- s = an - 3 * n;
- t = bn - 2 * n;
-
- ASSERT (0 < s && s <= n);
- ASSERT (0 < t && t <= n);
-
- /* This is true whenever an >= 25 or bn >= 19, I think. It
- guarantees that we can fit 5 values of size n+1 in the product
- area. */
- ASSERT (s+t >= 5);
-
-#define v0 pp /* 2n */
-#define vm1 (scratch) /* 2n+1 */
-#define v1 (pp + 2*n) /* 2n+1 */
-#define vm2 (scratch + 2 * n + 1) /* 2n+1 */
-#define v2 (scratch + 4 * n + 2) /* 2n+1 */
-#define vinf (pp + 5 * n) /* s+t */
-#define bs1 pp /* n+1 */
-#define bsm1 (scratch + 2 * n + 2) /* n+1 */
-#define asm1 (scratch + 3 * n + 3) /* n+1 */
-#define asm2 (scratch + 4 * n + 4) /* n+1 */
-#define bsm2 (pp + n + 1) /* n+1 */
-#define bs2 (pp + 2 * n + 2) /* n+1 */
-#define as2 (pp + 3 * n + 3) /* n+1 */
-#define as1 (pp + 4 * n + 4) /* n+1 */
-
- /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra
- limb, because products will overwrite 2n+2 limbs. */
-
-#define a0a2 scratch
-#define b0b2 scratch
-#define a1a3 asm1
-#define b1d bsm1
-
- /* Compute as2 and asm2. */
- flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));
-
- /* Compute bs2 and bsm2. */
- b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */
- cy = mpn_lshift (b0b2, b2, t, 2); /* 4b2 */
- cy += mpn_add_n (b0b2, b0b2, b0, t); /* 4b2 + b0 */
- if (t != n)
- cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
- b0b2[n] = cy;
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (mpn_cmp (b0b2, b1d, n+1) < 0)
- {
- mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
- flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
- }
- else
- {
- mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
- }
-#else
- mpn_add_n (bs2, b0b2, b1d, n+1);
- if (mpn_cmp (b0b2, b1d, n+1) < 0)
- {
- mpn_sub_n (bsm2, b1d, b0b2, n+1);
- flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
- }
- else
- {
- mpn_sub_n (bsm2, b0b2, b1d, n+1);
- }
-#endif
-
- /* Compute as1 and asm1. */
- flags = (enum toom6_flags) (flags ^ toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2));
-
- /* Compute bs1 and bsm1. */
- bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
- {
- cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
- bs1[n] = cy >> 1;
- flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
- }
- else
- {
- cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
- bs1[n] = bsm1[n] + (cy >> 1);
- bsm1[n]-= cy & 1;
- }
-#else
- bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
- if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
- {
- mpn_sub_n (bsm1, b1, bsm1, n);
- flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
- }
- else
- {
- bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
- }
-#endif
-
- ASSERT (as1[n] <= 3);
- ASSERT (bs1[n] <= 2);
- ASSERT (asm1[n] <= 1);
- ASSERT (bsm1[n] <= 1);
- ASSERT (as2[n] <=14);
- ASSERT (bs2[n] <= 6);
- ASSERT (asm2[n] <= 9);
- ASSERT (bsm2[n] <= 4);
-
- /* vm1, 2n+1 limbs */
- mpn_mul_n (vm1, asm1, bsm1, n+1); /* W4 */
-
- /* vm2, 2n+1 limbs */
- mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */
-
- /* v2, 2n+1 limbs */
- mpn_mul_n (v2, as2, bs2, n+1); /* W1 */
-
- /* v1, 2n+1 limbs */
- mpn_mul_n (v1, as1, bs1, n+1); /* W3 */
-
- /* vinf, s+t limbs */ /* W0 */
- if (s > t) mpn_mul (vinf, a3, s, b2, t);
- else mpn_mul (vinf, b2, t, a3, s);
-
- /* v0, 2n limbs */
- mpn_mul_n (v0, ap, bp, n); /* W5 */
-
- mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
-
-#undef v0
-#undef vm1
-#undef v1
-#undef vm2
-#undef v2
-#undef vinf
-#undef bs1
-#undef bs2
-#undef bsm1
-#undef bsm2
-#undef asm1
-#undef asm2
-/* #undef as1 */
-/* #undef as2 */
-#undef a0a2
-#undef b0b2
-#undef a1a3
-#undef b1d
-#undef a0
-#undef a1
-#undef a2
-#undef a3
-#undef b0
-#undef b1
-#undef b2
-}
diff --git a/gmp/mpn/generic/toom44_mul.c b/gmp/mpn/generic/toom44_mul.c
index 5abf2d14a9..37ff45279d 100644
--- a/gmp/mpn/generic/toom44_mul.c
+++ b/gmp/mpn/generic/toom44_mul.c
@@ -7,39 +7,36 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2008, 2013 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+/*
+ Things to work on:
+
+ 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
+ avoided by instead reusing the pp area and the scratch area.
+ 2. Use new toom functions for the recursive calls.
+*/
#include "gmp.h"
#include "gmp-impl.h"
-/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
+/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
<-s--><--n--><--n--><--n-->
____ ______ ______ ______
@@ -51,8 +48,8 @@ see https://www.gnu.org/licenses/. */
v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3
vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1
v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14
- vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) # A(2)*B(2) ah <= 9 |bh| <= 9
vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14
+ vmh = (8a0-4a1+2a2- a3)*(8b0-4b1+2b2- b3) # A(-1/2)*B(-1/2) -4<=ah<=9 -4<=bh<=9
vinf= a3 * b2 # A(inf)*B(inf)
*/
@@ -62,51 +59,28 @@ see https://www.gnu.org/licenses/. */
#define MAYBE_mul_toom44 1
#else
#define MAYBE_mul_basecase \
- (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
+ (MUL_TOOM44_THRESHOLD < 4 * MUL_KARATSUBA_THRESHOLD)
#define MAYBE_mul_toom22 \
(MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
#define MAYBE_mul_toom44 \
- (MUL_TOOM6H_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
+ (MUL_FFT_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
#endif
#define TOOM44_MUL_N_REC(p, a, b, n, ws) \
do { \
if (MAYBE_mul_basecase \
- && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
+ && BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
mpn_mul_basecase (p, a, n, b, n); \
else if (MAYBE_mul_toom22 \
&& BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
- mpn_toom22_mul (p, a, n, b, n, ws); \
+ mpn_kara_mul_n (p, a, b, n, ws); \
else if (! MAYBE_mul_toom44 \
|| BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) \
- mpn_toom33_mul (p, a, n, b, n, ws); \
+ mpn_toom3_mul_n (p, a, b, n, ws); \
else \
mpn_toom44_mul (p, a, n, b, n, ws); \
} while (0)
-/* Use of scratch space. In the product area, we store
-
- ___________________
- |vinf|____|_v1_|_v0_|
- s+t 2n-1 2n+1 2n
-
- The other recursive products, vm1, v2, vm2, vh are stored in the
- scratch area. When computing them, we use the product area for
- intermediate values.
-
- Next, we compute v1. We can store the intermediate factors at v0
- and at vh + 2n + 2.
-
- Finally, for v0 and vinf, factors are parts of the input operands,
- and we need scratch space only for the recursive multiplication.
-
- In all, if S(an) is the scratch need, the needed space is bounded by
-
- S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)
-
- which should give S(n) = 8 n/3 + c log(n) for some constant c.
-*/
-
void
mpn_toom44_mul (mp_ptr pp,
mp_srcptr ap, mp_size_t an,
@@ -115,7 +89,11 @@ mpn_toom44_mul (mp_ptr pp,
{
mp_size_t n, s, t;
mp_limb_t cy;
- enum toom7_flags flags;
+ mp_ptr gp, hp;
+ mp_ptr as1, asm1, as2, ash, asmh;
+ mp_ptr bs1, bsm1, bs2, bsh, bsmh;
+ enum toom4_flags flags;
+ TMP_DECL;
#define a0 ap
#define a1 (ap + n)
@@ -126,111 +104,227 @@ mpn_toom44_mul (mp_ptr pp,
#define b2 (bp + 2*n)
#define b3 (bp + 3*n)
- ASSERT (an >= bn);
-
n = (an + 3) >> 2;
s = an - 3 * n;
t = bn - 3 * n;
+ ASSERT (an >= bn);
+
ASSERT (0 < s && s <= n);
ASSERT (0 < t && t <= n);
- ASSERT (s >= t);
-
- /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
- * following limb, so these must be computed in order, and we need a
- * one limb gap to tp. */
-#define v0 pp /* 2n */
-#define v1 (pp + 2 * n) /* 2n+1 */
-#define vinf (pp + 6 * n) /* s+t */
-#define v2 scratch /* 2n+1 */
-#define vm2 (scratch + 2 * n + 1) /* 2n+1 */
-#define vh (scratch + 4 * n + 2) /* 2n+1 */
-#define vm1 (scratch + 6 * n + 3) /* 2n+1 */
-#define tp (scratch + 8*n + 5)
-
- /* apx and bpx must not overlap with v1 */
-#define apx pp /* n+1 */
-#define amx (pp + n + 1) /* n+1 */
-#define bmx (pp + 2*n + 2) /* n+1 */
-#define bpx (pp + 4*n + 2) /* n+1 */
- /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
- gives roughly 32 n/3 + log term. */
-
- /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */
- flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp));
-
- /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3. */
- flags = (enum toom7_flags) (flags ^ toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp));
+ TMP_MARK;
+
+ as1 = TMP_ALLOC_LIMBS (10 * n + 10);
+ asm1 = as1 + n + 1;
+ as2 = asm1 + n + 1;
+ ash = as2 + n + 1;
+ asmh = ash + n + 1;
+ bs1 = asmh + n + 1;
+ bsm1 = bs1 + n + 1;
+ bs2 = bsm1 + n + 1;
+ bsh = bs2 + n + 1;
+ bsmh = bsh + n + 1;
+
+ gp = pp;
+ hp = pp + n + 1;
+
+ flags = 0;
+
+ /* Compute as1 and asm1. */
+ gp[n] = mpn_add_n (gp, a0, a2, n);
+ hp[n] = mpn_add (hp, a1, n, a3, s);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_addsub_n (as1, asm1, hp, gp, n + 1);
+ flags ^= toom4_w3_neg;
+ }
+ else
+ {
+ mpn_addsub_n (as1, asm1, gp, hp, n + 1);
+ }
+#else
+ mpn_add_n (as1, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (asm1, hp, gp, n + 1);
+ flags ^= toom4_w3_neg;
+ }
+ else
+ {
+ mpn_sub_n (asm1, gp, hp, n + 1);
+ }
+#endif
- TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp); /* v2, 2n+1 limbs */
- TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp); /* vm2, 2n+1 limbs */
+ /* Compute as2. */
+#if HAVE_NATIVE_mpn_addlsh1_n
+ cy = mpn_addlsh1_n (as2, a2, a3, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
+#else
+ cy = mpn_lshift (as2, a3, s, 1);
+ cy += mpn_add_n (as2, a2, as2, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a1, as2, n);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a0, as2, n);
+#endif
+ as2[n] = cy;
- /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
+ /* Compute ash and asmh. */
+ cy = mpn_lshift (gp, a0, n, 3); /* 8a0 */
#if HAVE_NATIVE_mpn_addlsh1_n
- cy = mpn_addlsh1_n (apx, a1, a0, n);
- cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
- if (s < n)
+ gp[n] = cy + mpn_addlsh1_n (gp, gp, a2, n); /* 8a0 + 2a2 */
+#else
+ cy += mpn_lshift (hp, a2, n, 1); /* 2a2 */
+ gp[n] = cy + mpn_add_n (gp, gp, hp, n); /* 8a0 + 2a2 */
+#endif
+ cy = mpn_lshift (hp, a1, n, 2); /* 4a1 */
+ hp[n] = cy + mpn_add (hp, hp, n, a3, s); /* 4a1 + a3 */
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
{
- mp_limb_t cy2;
- cy2 = mpn_addlsh1_n (apx, a3, apx, s);
- apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
- MPN_INCR_U (apx + s, n+1-s, cy2);
+ mpn_addsub_n (ash, asmh, hp, gp, n + 1);
+ flags ^= toom4_w1_neg;
}
else
- apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
+ {
+ mpn_addsub_n (ash, asmh, gp, hp, n + 1);
+ }
#else
- cy = mpn_lshift (apx, a0, n, 1);
- cy += mpn_add_n (apx, apx, a1, n);
- cy = 2*cy + mpn_lshift (apx, apx, n, 1);
- cy += mpn_add_n (apx, apx, a2, n);
- cy = 2*cy + mpn_lshift (apx, apx, n, 1);
- apx[n] = cy + mpn_add (apx, apx, n, a3, s);
+ mpn_add_n (ash, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (asmh, hp, gp, n + 1);
+ flags ^= toom4_w1_neg;
+ }
+ else
+ {
+ mpn_sub_n (asmh, gp, hp, n + 1);
+ }
+#endif
+
+ /* Compute bs1 and bsm1. */
+ gp[n] = mpn_add_n (gp, b0, b2, n);
+ hp[n] = mpn_add (hp, b1, n, b3, t);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_addsub_n (bs1, bsm1, hp, gp, n + 1);
+ flags ^= toom4_w3_neg;
+ }
+ else
+ {
+ mpn_addsub_n (bs1, bsm1, gp, hp, n + 1);
+ }
+#else
+ mpn_add_n (bs1, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (bsm1, hp, gp, n + 1);
+ flags ^= toom4_w3_neg;
+ }
+ else
+ {
+ mpn_sub_n (bsm1, gp, hp, n + 1);
+ }
#endif
- /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */
+ /* Compute bs2. */
#if HAVE_NATIVE_mpn_addlsh1_n
- cy = mpn_addlsh1_n (bpx, b1, b0, n);
- cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
- if (t < n)
+ cy = mpn_addlsh1_n (bs2, b2, b3, t);
+ if (t != n)
+ cy = mpn_add_1 (bs2 + t, b2 + t, n - t, cy);
+ cy = 2 * cy + mpn_addlsh1_n (bs2, b1, bs2, n);
+ cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
+#else
+ cy = mpn_lshift (bs2, b3, t, 1);
+ cy += mpn_add_n (bs2, b2, bs2, t);
+ if (t != n)
+ cy = mpn_add_1 (bs2 + t, b2 + t, n - t, cy);
+ cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
+ cy += mpn_add_n (bs2, b1, bs2, n);
+ cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
+ cy += mpn_add_n (bs2, b0, bs2, n);
+#endif
+ bs2[n] = cy;
+
+ /* Compute bsh and bsmh. */
+ cy = mpn_lshift (gp, b0, n, 3); /* 8b0 */
+#if HAVE_NATIVE_mpn_addlsh1_n
+ gp[n] = cy + mpn_addlsh1_n (gp, gp, b2, n); /* 8b0 + 2b2 */
+#else
+ cy += mpn_lshift (hp, b2, n, 1); /* 2b2 */
+ gp[n] = cy + mpn_add_n (gp, gp, hp, n); /* 8b0 + 2b2 */
+#endif
+ cy = mpn_lshift (hp, b1, n, 2); /* 4b1 */
+ hp[n] = cy + mpn_add (hp, hp, n, b3, t); /* 4b1 + b3 */
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
{
- mp_limb_t cy2;
- cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
- bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
- MPN_INCR_U (bpx + t, n+1-t, cy2);
+ mpn_addsub_n (bsh, bsmh, hp, gp, n + 1);
+ flags ^= toom4_w1_neg;
}
else
- bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
+ {
+ mpn_addsub_n (bsh, bsmh, gp, hp, n + 1);
+ }
#else
- cy = mpn_lshift (bpx, b0, n, 1);
- cy += mpn_add_n (bpx, bpx, b1, n);
- cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
- cy += mpn_add_n (bpx, bpx, b2, n);
- cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
- bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
+ mpn_add_n (bsh, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (bsmh, hp, gp, n + 1);
+ flags ^= toom4_w1_neg;
+ }
+ else
+ {
+ mpn_sub_n (bsmh, gp, hp, n + 1);
+ }
#endif
- ASSERT (apx[n] < 15);
- ASSERT (bpx[n] < 15);
+ ASSERT (as1[n] <= 3);
+ ASSERT (bs1[n] <= 3);
+ ASSERT (asm1[n] <= 1);
+ ASSERT (bsm1[n] <= 1);
+ ASSERT (as2[n] <= 14);
+ ASSERT (bs2[n] <= 14);
+ ASSERT (ash[n] <= 14);
+ ASSERT (bsh[n] <= 14);
+ ASSERT (asmh[n] <= 9);
+ ASSERT (bsmh[n] <= 9);
+
+#define v0 pp /* 2n */
+#define v1 (scratch + 6 * n + 6) /* 2n+1 */
+#define vm1 scratch /* 2n+1 */
+#define v2 (scratch + 2 * n + 2) /* 2n+1 */
+#define vinf (pp + 6 * n) /* s+t */
+#define vh (pp + 2 * n) /* 2n+1 */
+#define vmh (scratch + 4 * n + 4)
+#define scratch_out (scratch + 8 * n + 8)
+
+ /* vm1, 2n+1 limbs */
+ TOOM44_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out); /* vm1, 2n+1 limbs */
- TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp); /* vh, 2n+1 limbs */
+ TOOM44_MUL_N_REC (v2 , as2 , bs2 , n + 1, scratch_out); /* v2, 2n+1 limbs */
- /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */
- flags = (enum toom7_flags) (flags | toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp));
+ if (s > t) mpn_mul (vinf, a3, s, b3, t);
+ else TOOM44_MUL_N_REC (vinf, a3, b3, s, scratch_out); /* vinf, s+t limbs */
- /* Compute bpx = b0 + b1 + b2 + b3 bnd bmx = b0 - b1 + b2 - b3. */
- flags = (enum toom7_flags) (flags ^ toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp));
+ TOOM44_MUL_N_REC (v1 , as1 , bs1 , n + 1, scratch_out); /* v1, 2n+1 limbs */
- TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp); /* vm1, 2n+1 limbs */
- /* Clobbers amx, bmx. */
- TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp); /* v1, 2n+1 limbs */
+ TOOM44_MUL_N_REC (vh , ash , bsh , n + 1, scratch_out);
- TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
- if (s > t)
- mpn_mul (vinf, a3, s, b3, t);
- else
- TOOM44_MUL_N_REC (vinf, a3, b3, s, tp); /* vinf, s+t limbs */
+ TOOM44_MUL_N_REC (vmh, asmh, bsmh, n + 1, scratch_out);
+
+ TOOM44_MUL_N_REC (v0 , ap , bp , n , scratch_out); /* v0, 2n limbs */
+
+ mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch_out);
- mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
+ TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom4_sqr.c b/gmp/mpn/generic/toom4_sqr.c
index b4154ba83f..911b5548d7 100644
--- a/gmp/mpn/generic/toom4_sqr.c
+++ b/gmp/mpn/generic/toom4_sqr.c
@@ -6,34 +6,31 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2010, 2013 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+/*
+ Things to work on:
+
+ 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
+ avoided by instead reusing the pp area and the scratch area.
+ 2. Use new toom functions for the recursive calls.
+*/
#include "gmp.h"
#include "gmp-impl.h"
@@ -43,14 +40,16 @@ see https://www.gnu.org/licenses/. */
<-s--><--n--><--n--><--n-->
____ ______ ______ ______
|_a3_|___a2_|___a1_|___a0_|
-
- v0 = a0 ^2 # A(0)^2
- v1 = ( a0+ a1+ a2+ a3)^2 # A(1)^2 ah <= 3
- vm1 = ( a0- a1+ a2- a3)^2 # A(-1)^2 |ah| <= 1
- v2 = ( a0+2a1+4a2+8a3)^2 # A(2)^2 ah <= 14
- vh = (8a0+4a1+2a2+ a3)^2 # A(1/2)^2 ah <= 14
- vmh = (8a0-4a1+2a2- a3)^2 # A(-1/2)^2 -4<=ah<=9
- vinf= a3 ^2 # A(inf)^2
+ |b3_|___b2_|___b1_|___b0_|
+ <-t-><--n--><--n--><--n-->
+
+ v0 = a0 * b0 # A(0)*B(0)
+ v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3
+ vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1
+ v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14
+ vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14
+ vmh = (8a0-4a1+2a2- a3)*(8b0-4b1+2b2- b3) # A(-1/2)*B(-1/2) -4<=ah<=9 -4<=bh<=9
+ vinf= a3 * b2 # A(inf)*B(inf)
*/
#if TUNE_PROGRAM_BUILD
@@ -59,24 +58,24 @@ see https://www.gnu.org/licenses/. */
#define MAYBE_sqr_toom4 1
#else
#define MAYBE_sqr_basecase \
- (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD)
+ (SQR_TOOM4_THRESHOLD < 4 * SQR_KARATSUBA_THRESHOLD)
#define MAYBE_sqr_toom2 \
(SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD)
#define MAYBE_sqr_toom4 \
- (SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
+ (SQR_FFT_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
#endif
-#define TOOM4_SQR_REC(p, a, n, ws) \
+#define TOOM4_SQR_N_REC(p, a, n, ws) \
do { \
if (MAYBE_sqr_basecase \
- && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \
+ && BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \
mpn_sqr_basecase (p, a, n); \
else if (MAYBE_sqr_toom2 \
&& BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
- mpn_toom2_sqr (p, a, n, ws); \
+ mpn_kara_sqr_n (p, a, n, ws); \
else if (! MAYBE_sqr_toom4 \
|| BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) \
- mpn_toom3_sqr (p, a, n, ws); \
+ mpn_toom3_sqr_n (p, a, n, ws); \
else \
mpn_toom4_sqr (p, a, n, ws); \
} while (0)
@@ -88,6 +87,9 @@ mpn_toom4_sqr (mp_ptr pp,
{
mp_size_t n, s;
mp_limb_t cy;
+ mp_ptr gp, hp;
+ mp_ptr as1, asm1, as2, ash, asmh;
+ TMP_DECL;
#define a0 ap
#define a1 (ap + n)
@@ -100,65 +102,122 @@ mpn_toom4_sqr (mp_ptr pp,
ASSERT (0 < s && s <= n);
- /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
- * following limb, so these must be computed in order, and we need a
- * one limb gap to tp. */
-#define v0 pp /* 2n */
-#define v1 (pp + 2 * n) /* 2n+1 */
-#define vinf (pp + 6 * n) /* s+t */
-#define v2 scratch /* 2n+1 */
-#define vm2 (scratch + 2 * n + 1) /* 2n+1 */
-#define vh (scratch + 4 * n + 2) /* 2n+1 */
-#define vm1 (scratch + 6 * n + 3) /* 2n+1 */
-#define tp (scratch + 8*n + 5)
+ TMP_MARK;
- /* No overlap with v1 */
-#define apx pp /* n+1 */
-#define amx (pp + 4*n + 2) /* n+1 */
+ as1 = TMP_SALLOC_LIMBS (n + 1);
+ asm1 = TMP_SALLOC_LIMBS (n + 1);
+ as2 = TMP_SALLOC_LIMBS (n + 1);
+ ash = TMP_SALLOC_LIMBS (n + 1);
+ asmh = TMP_SALLOC_LIMBS (n + 1);
- /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
- gives roughly 32 n/3 + log term. */
+ gp = pp;
+ hp = pp + n + 1;
- /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */
- mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
+ /* Compute as1 and asm1. */
+ gp[n] = mpn_add_n (gp, a0, a2, n);
+ hp[n] = mpn_add (hp, a1, n, a3, s);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_addsub_n (as1, asm1, hp, gp, n + 1);
+ }
+ else
+ {
+ mpn_addsub_n (as1, asm1, gp, hp, n + 1);
+ }
+#else
+ mpn_add_n (as1, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (asm1, hp, gp, n + 1);
+ }
+ else
+ {
+ mpn_sub_n (asm1, gp, hp, n + 1);
+ }
+#endif
- TOOM4_SQR_REC (v2, apx, n + 1, tp); /* v2, 2n+1 limbs */
- TOOM4_SQR_REC (vm2, amx, n + 1, tp); /* vm2, 2n+1 limbs */
+ /* Compute as2. */
+#if HAVE_NATIVE_mpn_addlsh1_n
+ cy = mpn_addlsh1_n (as2, a2, a3, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
+#else
+ cy = mpn_lshift (as2, a3, s, 1);
+ cy += mpn_add_n (as2, a2, as2, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a1, as2, n);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a0, as2, n);
+#endif
+ as2[n] = cy;
- /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
+ /* Compute ash and asmh. */
+ cy = mpn_lshift (gp, a0, n, 3); /* 8a0 */
#if HAVE_NATIVE_mpn_addlsh1_n
- cy = mpn_addlsh1_n (apx, a1, a0, n);
- cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
- if (s < n)
+ gp[n] = cy + mpn_addlsh1_n (gp, gp, a2, n); /* 8a0 + 2a2 */
+#else
+ cy += mpn_lshift (hp, a2, n, 1); /* 2a2 */
+ gp[n] = cy + mpn_add_n (gp, gp, hp, n); /* 8a0 + 2a2 */
+#endif
+ cy = mpn_lshift (hp, a1, n, 2); /* 4a1 */
+ hp[n] = cy + mpn_add (hp, hp, n, a3, s); /* 4a1 + a3 */
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
{
- mp_limb_t cy2;
- cy2 = mpn_addlsh1_n (apx, a3, apx, s);
- apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
- MPN_INCR_U (apx + s, n+1-s, cy2);
+ mpn_addsub_n (ash, asmh, hp, gp, n + 1);
}
else
- apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
+ {
+ mpn_addsub_n (ash, asmh, gp, hp, n + 1);
+ }
#else
- cy = mpn_lshift (apx, a0, n, 1);
- cy += mpn_add_n (apx, apx, a1, n);
- cy = 2*cy + mpn_lshift (apx, apx, n, 1);
- cy += mpn_add_n (apx, apx, a2, n);
- cy = 2*cy + mpn_lshift (apx, apx, n, 1);
- apx[n] = cy + mpn_add (apx, apx, n, a3, s);
+ mpn_add_n (ash, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (asmh, hp, gp, n + 1);
+ }
+ else
+ {
+ mpn_sub_n (asmh, gp, hp, n + 1);
+ }
#endif
- ASSERT (apx[n] < 15);
+ ASSERT (as1[n] <= 3);
+ ASSERT (asm1[n] <= 1);
+ ASSERT (as2[n] <= 14);
+ ASSERT (ash[n] <= 14);
+ ASSERT (asmh[n] <= 9);
+
+#define v0 pp /* 2n */
+#define v1 (scratch + 6 * n + 6) /* 2n+1 */
+#define vm1 scratch /* 2n+1 */
+#define v2 (scratch + 2 * n + 2) /* 2n+1 */
+#define vinf (pp + 6 * n) /* s+t */
+#define vh (pp + 2 * n) /* 2n+1 */
+#define vmh (scratch + 4 * n + 4)
+#define scratch_out (scratch + 8 * n + 8)
+
+ /* vm1, 2n+1 limbs */
+ TOOM4_SQR_N_REC (vm1, asm1, n + 1, scratch_out); /* vm1, 2n+1 limbs */
+
+ TOOM4_SQR_N_REC (v2 , as2 , n + 1, scratch_out); /* v2, 2n+1 limbs */
+
+ TOOM4_SQR_N_REC (vinf, a3 , s, scratch_out); /* vinf, 2s limbs */
+
+ TOOM4_SQR_N_REC (v1 , as1 , n + 1, scratch_out); /* v1, 2n+1 limbs */
- TOOM4_SQR_REC (vh, apx, n + 1, tp); /* vh, 2n+1 limbs */
+ TOOM4_SQR_N_REC (vh , ash , n + 1, scratch_out);
- /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */
- mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
+ TOOM4_SQR_N_REC (vmh, asmh, n + 1, scratch_out);
- TOOM4_SQR_REC (v1, apx, n + 1, tp); /* v1, 2n+1 limbs */
- TOOM4_SQR_REC (vm1, amx, n + 1, tp); /* vm1, 2n+1 limbs */
+ TOOM4_SQR_N_REC (v0 , ap , n , scratch_out); /* v0, 2n limbs */
- TOOM4_SQR_REC (v0, a0, n, tp);
- TOOM4_SQR_REC (vinf, a3, s, tp); /* vinf, 2s limbs */
+ mpn_toom_interpolate_7pts (pp, n, 0, vmh, vm1, v1, v2, s + s, scratch_out);
- mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) 0, vm2, vm1, v2, vh, 2*s, tp);
+ TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom52_mul.c b/gmp/mpn/generic/toom52_mul.c
deleted file mode 100644
index e15b5833aa..0000000000
--- a/gmp/mpn/generic/toom52_mul.c
+++ /dev/null
@@ -1,257 +0,0 @@
-/* mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
- times as large as bn. Or more accurately, bn < an < 2 bn.
-
- Contributed to the GNU project by Marco Bodrato.
-
- The idea of applying toom to unbalanced multiplication is due to Marco
- Bodrato and Alberto Zanoni.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Evaluate in: -2, -1, 0, +1, +2, +inf
-
- <-s-><--n--><--n--><--n--><--n-->
- ___ ______ ______ ______ ______
- |a4_|___a3_|___a2_|___a1_|___a0_|
- |b1|___b0_|
- <t-><--n-->
-
- v0 = a0 * b0 # A(0)*B(0)
- v1 = (a0+ a1+ a2+ a3+ a4)*(b0+ b1) # A(1)*B(1) ah <= 4 bh <= 1
- vm1 = (a0- a1+ a2- a3+ a4)*(b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0
- v2 = (a0+2a1+4a2+8a3+16a4)*(b0+2b1) # A(2)*B(2) ah <= 30 bh <= 2
- vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1) # A(-2)*B(-2) |ah| <= 20 |bh|<= 1
- vinf= a4 * b1 # A(inf)*B(inf)
-
- Some slight optimization in evaluation are taken from the paper:
- "Towards Optimal Toom-Cook Multiplication for Univariate and
- Multivariate Polynomials in Characteristic 2 and 0."
-*/
-
-void
-mpn_toom52_mul (mp_ptr pp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
-{
- mp_size_t n, s, t;
- enum toom6_flags flags;
-
-#define a0 ap
-#define a1 (ap + n)
-#define a2 (ap + 2 * n)
-#define a3 (ap + 3 * n)
-#define a4 (ap + 4 * n)
-#define b0 bp
-#define b1 (bp + n)
-
- n = 1 + (2 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) >> 1);
-
- s = an - 4 * n;
- t = bn - n;
-
- ASSERT (0 < s && s <= n);
- ASSERT (0 < t && t <= n);
-
- /* Ensures that 5 values of n+1 limbs each fits in the product area.
- Borderline cases are an = 32, bn = 8, n = 7, and an = 36, bn = 9,
- n = 8. */
- ASSERT (s+t >= 5);
-
-#define v0 pp /* 2n */
-#define vm1 (scratch) /* 2n+1 */
-#define v1 (pp + 2 * n) /* 2n+1 */
-#define vm2 (scratch + 2 * n + 1) /* 2n+1 */
-#define v2 (scratch + 4 * n + 2) /* 2n+1 */
-#define vinf (pp + 5 * n) /* s+t */
-#define bs1 pp /* n+1 */
-#define bsm1 (scratch + 2 * n + 2) /* n */
-#define asm1 (scratch + 3 * n + 3) /* n+1 */
-#define asm2 (scratch + 4 * n + 4) /* n+1 */
-#define bsm2 (pp + n + 1) /* n+1 */
-#define bs2 (pp + 2 * n + 2) /* n+1 */
-#define as2 (pp + 3 * n + 3) /* n+1 */
-#define as1 (pp + 4 * n + 4) /* n+1 */
-
- /* Scratch need is 6 * n + 3 + 1. We need one extra limb, because
- products will overwrite 2n+2 limbs. */
-
-#define a0a2 scratch
-#define a1a3 asm1
-
- /* Compute as2 and asm2. */
- flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a1a3));
-
- /* Compute bs1 and bsm1. */
- if (t == n)
- {
-#if HAVE_NATIVE_mpn_add_n_sub_n
- mp_limb_t cy;
-
- if (mpn_cmp (b0, b1, n) < 0)
- {
- cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
- flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
- }
- else
- {
- cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
- }
- bs1[n] = cy >> 1;
-#else
- bs1[n] = mpn_add_n (bs1, b0, b1, n);
- if (mpn_cmp (b0, b1, n) < 0)
- {
- mpn_sub_n (bsm1, b1, b0, n);
- flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
- }
- else
- {
- mpn_sub_n (bsm1, b0, b1, n);
- }
-#endif
- }
- else
- {
- bs1[n] = mpn_add (bs1, b0, n, b1, t);
- if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
- {
- mpn_sub_n (bsm1, b1, b0, t);
- MPN_ZERO (bsm1 + t, n - t);
- flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
- }
- else
- {
- mpn_sub (bsm1, b0, n, b1, t);
- }
- }
-
- /* Compute bs2 and bsm2, recycling bs1 and bsm1. bs2=bs1+b1; bsm2=bsm1-b1 */
- mpn_add (bs2, bs1, n+1, b1, t);
- if (flags & toom6_vm1_neg )
- {
- bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
- flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
- }
- else
- {
- bsm2[n] = 0;
- if (t == n)
- {
- if (mpn_cmp (bsm1, b1, n) < 0)
- {
- mpn_sub_n (bsm2, b1, bsm1, n);
- flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
- }
- else
- {
- mpn_sub_n (bsm2, bsm1, b1, n);
- }
- }
- else
- {
- if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
- {
- mpn_sub_n (bsm2, b1, bsm1, t);
- MPN_ZERO (bsm2 + t, n - t);
- flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
- }
- else
- {
- mpn_sub (bsm2, bsm1, n, b1, t);
- }
- }
- }
-
- /* Compute as1 and asm1. */
- flags = (enum toom6_flags) (flags ^ toom6_vm1_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2));
-
- ASSERT (as1[n] <= 4);
- ASSERT (bs1[n] <= 1);
- ASSERT (asm1[n] <= 2);
-/* ASSERT (bsm1[n] <= 1); */
- ASSERT (as2[n] <=30);
- ASSERT (bs2[n] <= 2);
- ASSERT (asm2[n] <= 20);
- ASSERT (bsm2[n] <= 1);
-
- /* vm1, 2n+1 limbs */
- mpn_mul (vm1, asm1, n+1, bsm1, n); /* W4 */
-
- /* vm2, 2n+1 limbs */
- mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */
-
- /* v2, 2n+1 limbs */
- mpn_mul_n (v2, as2, bs2, n+1); /* W1 */
-
- /* v1, 2n+1 limbs */
- mpn_mul_n (v1, as1, bs1, n+1); /* W3 */
-
- /* vinf, s+t limbs */ /* W0 */
- if (s > t) mpn_mul (vinf, a4, s, b1, t);
- else mpn_mul (vinf, b1, t, a4, s);
-
- /* v0, 2n limbs */
- mpn_mul_n (v0, ap, bp, n); /* W5 */
-
- mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
-
-#undef v0
-#undef vm1
-#undef v1
-#undef vm2
-#undef v2
-#undef vinf
-#undef bs1
-#undef bs2
-#undef bsm1
-#undef bsm2
-#undef asm1
-#undef asm2
-#undef as1
-#undef as2
-#undef a0a2
-#undef b0b2
-#undef a1a3
-#undef a0
-#undef a1
-#undef a2
-#undef a3
-#undef b0
-#undef b1
-#undef b2
-
-}
diff --git a/gmp/mpn/generic/toom53_mul.c b/gmp/mpn/generic/toom53_mul.c
index 41274d48e0..4483d4dfb7 100644
--- a/gmp/mpn/generic/toom53_mul.c
+++ b/gmp/mpn/generic/toom53_mul.c
@@ -10,39 +10,35 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2008, 2012 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+/*
+ Things to work on:
+
+ 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
+ avoided by instead reusing the pp area and the scratch allocation.
+*/
+
#include "gmp.h"
#include "gmp-impl.h"
-/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
+/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
<-s-><--n--><--n--><--n--><--n-->
___ ______ ______ ______ ______
@@ -54,8 +50,8 @@ see https://www.gnu.org/licenses/. */
v1 = ( a0+ a1+ a2+ a3+ a4)*( b0+ b1+ b2) # A(1)*B(1) ah <= 4 bh <= 2
vm1 = ( a0- a1+ a2- a3+ a4)*( b0- b1+ b2) # A(-1)*B(-1) |ah| <= 2 bh <= 1
v2 = ( a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) # A(2)*B(2) ah <= 30 bh <= 6
- vm2 = ( a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) # A(2)*B(2) -9<=ah<=20 -1<=bh<=4
vh = (16a0+8a1+4a2+2a3+ a4)*(4b0+2b1+ b2) # A(1/2)*B(1/2) ah <= 30 bh <= 6
+ vmh = (16a0-8a1+4a2-2a3+ a4)*(4b0-2b1+ b2) # A(-1/2)*B(-1/2) -9<=ah<=20 -1<=bh<=4
vinf= a4 * b2 # A(inf)*B(inf)
*/
@@ -66,11 +62,12 @@ mpn_toom53_mul (mp_ptr pp,
mp_ptr scratch)
{
mp_size_t n, s, t;
+ int vm1_neg, vmh_neg;
mp_limb_t cy;
- mp_ptr gp;
- mp_ptr as1, asm1, as2, asm2, ash;
- mp_ptr bs1, bsm1, bs2, bsm2, bsh;
- enum toom7_flags flags;
+ mp_ptr gp, hp;
+ mp_ptr as1, asm1, as2, ash, asmh;
+ mp_ptr bs1, bsm1, bs2, bsh, bsmh;
+ enum toom4_flags flags;
TMP_DECL;
#define a0 ap
@@ -95,61 +92,124 @@ mpn_toom53_mul (mp_ptr pp,
as1 = TMP_SALLOC_LIMBS (n + 1);
asm1 = TMP_SALLOC_LIMBS (n + 1);
as2 = TMP_SALLOC_LIMBS (n + 1);
- asm2 = TMP_SALLOC_LIMBS (n + 1);
ash = TMP_SALLOC_LIMBS (n + 1);
+ asmh = TMP_SALLOC_LIMBS (n + 1);
bs1 = TMP_SALLOC_LIMBS (n + 1);
bsm1 = TMP_SALLOC_LIMBS (n + 1);
bs2 = TMP_SALLOC_LIMBS (n + 1);
- bsm2 = TMP_SALLOC_LIMBS (n + 1);
bsh = TMP_SALLOC_LIMBS (n + 1);
+ bsmh = TMP_SALLOC_LIMBS (n + 1);
gp = pp;
+ hp = pp + n + 1;
/* Compute as1 and asm1. */
- flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));
-
- /* Compute as2 and asm2. */
- flags = (enum toom7_flags) (flags | toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp));
+ gp[n] = mpn_add_n (gp, a0, a2, n);
+ gp[n] += mpn_add (gp, gp, n, a4, s);
+ hp[n] = mpn_add_n (hp, a1, a3, n);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_addsub_n (as1, asm1, hp, gp, n + 1);
+ vm1_neg = 1;
+ }
+ else
+ {
+ mpn_addsub_n (as1, asm1, gp, hp, n + 1);
+ vm1_neg = 0;
+ }
+#else
+ mpn_add_n (as1, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (asm1, hp, gp, n + 1);
+ vm1_neg = 1;
+ }
+ else
+ {
+ mpn_sub_n (asm1, gp, hp, n + 1);
+ vm1_neg = 0;
+ }
+#endif
- /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
- = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4 */
+ /* Compute as2. */
+#if !HAVE_NATIVE_mpn_addlsh_n
+ ash[n] = mpn_lshift (ash, a2, n, 2); /* 4a2 */
+#endif
#if HAVE_NATIVE_mpn_addlsh1_n
- cy = mpn_addlsh1_n (ash, a1, a0, n);
- cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
- cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
- if (s < n)
+ cy = mpn_addlsh1_n (as2, a3, a4, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
+ as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
+#else
+ cy = mpn_lshift (as2, a4, s, 1);
+ cy += mpn_add_n (as2, a3, as2, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
+ cy = 4 * cy + mpn_lshift (as2, as2, n, 2);
+ cy += mpn_add_n (as2, a1, as2, n);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ as2[n] = cy + mpn_add_n (as2, a0, as2, n);
+ mpn_add_n (as2, ash, as2, n + 1);
+#endif
+
+ /* Compute ash and asmh. */
+#if HAVE_NATIVE_mpn_addlsh_n
+ cy = mpn_addlsh_n (gp, a2, a0, n, 2); /* 4a0 + a2 */
+ cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2); /* 16a0 + 4a2 + a4 */ /* FIXME s */
+ gp[n] = cy;
+ cy = mpn_addlsh_n (hp, a3, a1, n, 2); /* 4a1 + a3 */
+ cy = 2 * cy + mpn_lshift (hp, hp, n, 1); /* 8a1 + 2a3 */
+ hp[n] = cy;
+#else
+ gp[n] = mpn_lshift (gp, a0, n, 4); /* 16a0 */
+ mpn_add (gp, gp, n + 1, a4, s); /* 16a0 + a4 */
+ mpn_add_n (gp, ash, gp, n+1); /* 16a0 + 4a2 + a4 */
+ cy = mpn_lshift (hp, a1, n, 3); /* 8a1 */
+ cy += mpn_lshift (ash, a3, n, 1); /* 2a3 */
+ cy += mpn_add_n (hp, ash, hp, n); /* 8a1 + 2a3 */
+ hp[n] = cy;
+#endif
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
{
- mp_limb_t cy2;
- cy2 = mpn_addlsh1_n (ash, a4, ash, s);
- ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
- MPN_INCR_U (ash + s, n+1-s, cy2);
+ mpn_addsub_n (ash, asmh, hp, gp, n + 1);
+ vmh_neg = 1;
}
else
- ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
+ {
+ mpn_addsub_n (ash, asmh, gp, hp, n + 1);
+ vmh_neg = 0;
+ }
#else
- cy = mpn_lshift (ash, a0, n, 1);
- cy += mpn_add_n (ash, ash, a1, n);
- cy = 2*cy + mpn_lshift (ash, ash, n, 1);
- cy += mpn_add_n (ash, ash, a2, n);
- cy = 2*cy + mpn_lshift (ash, ash, n, 1);
- cy += mpn_add_n (ash, ash, a3, n);
- cy = 2*cy + mpn_lshift (ash, ash, n, 1);
- ash[n] = cy + mpn_add (ash, ash, n, a4, s);
+ mpn_add_n (ash, gp, hp, n + 1);
+ if (mpn_cmp (gp, hp, n + 1) < 0)
+ {
+ mpn_sub_n (asmh, hp, gp, n + 1);
+ vmh_neg = 1;
+ }
+ else
+ {
+ mpn_sub_n (asmh, gp, hp, n + 1);
+ vmh_neg = 0;
+ }
#endif
/* Compute bs1 and bsm1. */
bs1[n] = mpn_add (bs1, b0, n, b2, t); /* b0 + b2 */
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
{
- bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
+ bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1;
bsm1[n] = 0;
- flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
+ vm1_neg ^= 1;
}
else
{
- cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
+ cy = mpn_addsub_n (bs1, bsm1, bs1, b1, n);
bsm1[n] = bs1[n] - (cy & 1);
bs1[n] += (cy >> 1);
}
@@ -158,7 +218,7 @@ mpn_toom53_mul (mp_ptr pp,
{
mpn_sub_n (bsm1, b1, bs1, n);
bsm1[n] = 0;
- flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
+ vm1_neg ^= 1;
}
else
{
@@ -167,64 +227,46 @@ mpn_toom53_mul (mp_ptr pp,
bs1[n] += mpn_add_n (bs1, bs1, b1, n); /* b0+b1+b2 */
#endif
- /* Compute bs2 and bsm2. */
-#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
-#if HAVE_NATIVE_mpn_addlsh2_n
- cy = mpn_addlsh2_n (bs2, b0, b2, t);
-#else /* HAVE_NATIVE_mpn_addlsh_n */
- cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
-#endif
- if (t < n)
- cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
- bs2[n] = cy;
+ /* Compute bs2 */
+ hp[n] = mpn_lshift (hp, b1, n, 1); /* 2b1 */
+
+#ifdef HAVE_NATIVE_mpn_addlsh1_n
+ cy = mpn_addlsh1_n (bs2, b1, b2, t);
+ if (t != n)
+ cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
+ bs2[n] = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
- cy = mpn_lshift (gp, b2, t, 2);
- bs2[n] = mpn_add (bs2, b0, n, gp, t);
- MPN_INCR_U (bs2 + t, n+1-t, cy);
+ bs2[t] = mpn_lshift (bs2, b2, t, 2);
+ mpn_add (bs2, hp, n + 1, bs2, t + 1);
+ bs2[n] += mpn_add_n (bs2, bs2, b0, n);
#endif
- gp[n] = mpn_lshift (gp, b1, n, 1);
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (mpn_cmp (bs2, gp, n+1) < 0)
+ /* Compute bsh and bsmh. */
+#if HAVE_NATIVE_mpn_addlsh_n
+ gp[n] = mpn_addlsh_n (gp, b2, b0, n, 2); /* 4a0 + a2 */
+#else
+ cy = mpn_lshift (gp, b0, n, 2); /* 4b0 */
+ gp[n] = cy + mpn_add (gp, gp, n, b2, t); /* 4b0 + b2 */
+#endif
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (gp, hp, n + 1) < 0)
{
- ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
- flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
+ mpn_addsub_n (bsh, bsmh, hp, gp, n + 1);
+ vmh_neg^= 1;
}
else
- {
- ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
- }
+ mpn_addsub_n (bsh, bsmh, gp, hp, n + 1);
#else
- if (mpn_cmp (bs2, gp, n+1) < 0)
+ mpn_add_n (bsh, gp, hp, n + 1); /* 4b0 + 2b1 + b2 */
+ if (mpn_cmp (gp, hp, n + 1) < 0)
{
- ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
- flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
+ mpn_sub_n (bsmh, hp, gp, n + 1);
+ vmh_neg ^= 1;
}
else
{
- ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
+ mpn_sub_n (bsmh, gp, hp, n + 1);
}
- mpn_add_n (bs2, bs2, gp, n+1);
-#endif
-
- /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1)+b2. */
-#if HAVE_NATIVE_mpn_addlsh1_n
- cy = mpn_addlsh1_n (bsh, b1, b0, n);
- if (t < n)
- {
- mp_limb_t cy2;
- cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
- bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
- MPN_INCR_U (bsh + t, n+1-t, cy2);
- }
- else
- bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
-#else
- cy = mpn_lshift (bsh, b0, n, 1);
- cy += mpn_add_n (bsh, bsh, b1, n);
- cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
- bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
#endif
ASSERT (as1[n] <= 4);
@@ -233,26 +275,18 @@ mpn_toom53_mul (mp_ptr pp,
ASSERT (bsm1[n] <= 1);
ASSERT (as2[n] <= 30);
ASSERT (bs2[n] <= 6);
- ASSERT (asm2[n] <= 20);
- ASSERT (bsm2[n] <= 4);
ASSERT (ash[n] <= 30);
ASSERT (bsh[n] <= 6);
+ ASSERT (asmh[n] <= 20);
+ ASSERT (bsmh[n] <= 4);
#define v0 pp /* 2n */
-#define v1 (pp + 2 * n) /* 2n+1 */
+#define v1 (scratch + 6 * n + 6) /* 2n+1 */
+#define vm1 scratch /* 2n+1 */
+#define v2 (scratch + 2 * n + 2) /* 2n+1 */
#define vinf (pp + 6 * n) /* s+t */
-#define v2 scratch /* 2n+1 */
-#define vm2 (scratch + 2 * n + 1) /* 2n+1 */
-#define vh (scratch + 4 * n + 2) /* 2n+1 */
-#define vm1 (scratch + 6 * n + 3) /* 2n+1 */
-#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */
- /* Total scratch need: 10*n+5 */
-
- /* Must be in allocation order, as they overwrite one limb beyond
- * 2n+1. */
- mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */
- mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */
- mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */
+#define vh (pp + 2 * n) /* 2n+1 */
+#define vmh (scratch + 4 * n + 4)
/* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
@@ -279,6 +313,12 @@ mpn_toom53_mul (mp_ptr pp,
mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
#endif /* SMALLER_RECURSION */
+ mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */
+
+ /* vinf, s+t limbs */
+ if (s > t) mpn_mul (vinf, a4, s, b2, t);
+ else mpn_mul (vinf, b2, t, a4, s);
+
/* v1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
mpn_mul_n (v1, as1, bs1, n);
@@ -318,14 +358,16 @@ mpn_toom53_mul (mp_ptr pp,
mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
#endif /* SMALLER_RECURSION */
- mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */
+ mpn_mul_n (vh, ash, bsh, n + 1);
- /* vinf, s+t limbs */
- if (s > t) mpn_mul (vinf, a4, s, b2, t);
- else mpn_mul (vinf, b2, t, a4, s);
+ mpn_mul_n (vmh, asmh, bsmh, n + 1);
+
+ mpn_mul_n (v0, ap, bp, n); /* v0, 2n limbs */
+
+ flags = vm1_neg ? toom4_w3_neg : 0;
+ flags |= vmh_neg ? toom4_w1_neg : 0;
- mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
- scratch_out);
+ mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8 * n + 8);
TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom54_mul.c b/gmp/mpn/generic/toom54_mul.c
deleted file mode 100644
index 939bb53ab6..0000000000
--- a/gmp/mpn/generic/toom54_mul.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Implementation of the algorithm for Toom-Cook 4.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-/* Toom-4.5, the splitting 5x4 unbalanced version.
- Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0.
-
- <--s-><--n--><--n--><--n--><--n-->
- ____ ______ ______ ______ ______
- |_a4_|__a3__|__a2__|__a1__|__a0__|
- |b3_|__b2__|__b1__|__b0__|
- <-t-><--n--><--n--><--n-->
-
-*/
-#define TOOM_54_MUL_N_REC(p, a, b, n, ws) \
- do { mpn_mul_n (p, a, b, n); \
- } while (0)
-
-#define TOOM_54_MUL_REC(p, a, na, b, nb, ws) \
- do { mpn_mul (p, a, na, b, nb); \
- } while (0)
-
-void
-mpn_toom54_mul (mp_ptr pp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
-{
- mp_size_t n, s, t;
- int sign;
-
- /***************************** decomposition *******************************/
-#define a4 (ap + 4 * n)
-#define b3 (bp + 3 * n)
-
- ASSERT (an >= bn);
- n = 1 + (4 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 4);
-
- s = an - 4 * n;
- t = bn - 3 * n;
-
- ASSERT (0 < s && s <= n);
- ASSERT (0 < t && t <= n);
- /* Required by mpn_toom_interpolate_8pts. */
- ASSERT ( s + t >= n );
- ASSERT ( s + t > 4);
- ASSERT ( n > 2);
-
-#define r8 pp /* 2n */
-#define r7 scratch /* 3n+1 */
-#define r5 (pp + 3*n) /* 3n+1 */
-#define v0 (pp + 3*n) /* n+1 */
-#define v1 (pp + 4*n+1) /* n+1 */
-#define v2 (pp + 5*n+2) /* n+1 */
-#define v3 (pp + 6*n+3) /* n+1 */
-#define r3 (scratch + 3 * n + 1) /* 3n+1 */
-#define r1 (pp + 7*n) /* s+t <= 2*n */
-#define ws (scratch + 6 * n + 2) /* ??? */
-
- /* Alloc also 3n+1 limbs for ws... mpn_toom_interpolate_8pts may
- need all of them, when DO_mpn_sublsh_n usea a scratch */
- /********************** evaluation and recursive calls *********************/
- /* $\pm4$ */
- sign = mpn_toom_eval_pm2exp (v2, v0, 4, ap, n, s, 2, pp)
- ^ mpn_toom_eval_pm2exp (v3, v1, 3, bp, n, t, 2, pp);
- TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */
- TOOM_54_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */
- mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4);
-
- /* $\pm1$ */
- sign = mpn_toom_eval_pm1 (v2, v0, 4, ap, n, s, pp)
- ^ mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp);
- TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */
- TOOM_54_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */
- mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0);
-
- /* $\pm2$ */
- sign = mpn_toom_eval_pm2 (v2, v0, 4, ap, n, s, pp)
- ^ mpn_toom_eval_dgr3_pm2 (v3, v1, bp, n, t, pp);
- TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */
- TOOM_54_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */
- mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2);
-
- /* A(0)*B(0) */
- TOOM_54_MUL_N_REC(pp, ap, bp, n, ws);
-
- /* Infinity */
- if (s > t) {
- TOOM_54_MUL_REC(r1, a4, s, b3, t, ws);
- } else {
- TOOM_54_MUL_REC(r1, b3, t, a4, s, ws);
- };
-
- mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws);
-
-#undef a4
-#undef b3
-#undef r1
-#undef r3
-#undef r5
-#undef v0
-#undef v1
-#undef v2
-#undef v3
-#undef r7
-#undef r8
-#undef ws
-}
diff --git a/gmp/mpn/generic/toom62_mul.c b/gmp/mpn/generic/toom62_mul.c
index 3759e3cb3c..944b3feffd 100644
--- a/gmp/mpn/generic/toom62_mul.c
+++ b/gmp/mpn/generic/toom62_mul.c
@@ -10,42 +10,38 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2006-2008, 2012 Free Software Foundation, Inc.
+Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+/*
+ Things to work on:
+
+ 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be
+ avoided by instead reusing the pp area and the scratch allocation.
+*/
+
#include "gmp.h"
#include "gmp-impl.h"
-/* Evaluate in:
- 0, +1, -1, +2, -2, 1/2, +inf
- <-s-><--n--><--n--><--n--><--n--><--n-->
+/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
+
+ <-s-><--n--><--n--><--n--><--n--><--n-->
___ ______ ______ ______ ______ ______
|a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
|_b1_|___b0_|
@@ -55,8 +51,8 @@ see https://www.gnu.org/licenses/. */
v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 bh <= 1
vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0
v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 bh <= 2
- vm2 = ( a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) # A(-2)*B(-2) -41<=ah<=20 -1<=bh<=0
vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 bh <= 2
+ vmh = (32a0-16a1+8a2-4a3+ 2a4- a5)*(2b0- b1) # A(-1/2)*B(-1/2) -20<=ah<=41 0<=bh<=1
vinf= a5 * b1 # A(inf)*B(inf)
*/
@@ -67,11 +63,12 @@ mpn_toom62_mul (mp_ptr pp,
mp_ptr scratch)
{
mp_size_t n, s, t;
+ int vm1_neg, vmh_neg, bsm_neg;
mp_limb_t cy;
- mp_ptr as1, asm1, as2, asm2, ash;
- mp_ptr bs1, bsm1, bs2, bsm2, bsh;
- mp_ptr gp;
- enum toom7_flags aflags, bflags;
+ mp_ptr a0_a2, a1_a3;
+ mp_ptr as1, asm1, as2, ash, asmh;
+ mp_ptr bs1, bsm1, bs2, bsh, bsmh;
+ enum toom4_flags flags;
TMP_DECL;
#define a0 ap
@@ -83,7 +80,7 @@ mpn_toom62_mul (mp_ptr pp,
#define b0 bp
#define b1 (bp + n)
- n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);
+ n = 1 + (an >= 3 * bn ? (an - 1) / (unsigned long) 6 : (bn - 1) >> 1);
s = an - 5 * n;
t = bn - n;
@@ -96,66 +93,133 @@ mpn_toom62_mul (mp_ptr pp,
as1 = TMP_SALLOC_LIMBS (n + 1);
asm1 = TMP_SALLOC_LIMBS (n + 1);
as2 = TMP_SALLOC_LIMBS (n + 1);
- asm2 = TMP_SALLOC_LIMBS (n + 1);
ash = TMP_SALLOC_LIMBS (n + 1);
+ asmh = TMP_SALLOC_LIMBS (n + 1);
bs1 = TMP_SALLOC_LIMBS (n + 1);
bsm1 = TMP_SALLOC_LIMBS (n);
bs2 = TMP_SALLOC_LIMBS (n + 1);
- bsm2 = TMP_SALLOC_LIMBS (n + 1);
bsh = TMP_SALLOC_LIMBS (n + 1);
+ bsmh = TMP_SALLOC_LIMBS (n + 1);
- gp = pp;
+ a0_a2 = pp;
+ a1_a3 = pp + n + 1;
/* Compute as1 and asm1. */
- aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));
-
- /* Compute as2 and asm2. */
- aflags = (enum toom7_flags) (aflags | toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp));
-
- /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
- = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5 */
+ a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n);
+ a0_a2[n] += mpn_add_n (a0_a2, a0_a2, a4, n);
+ a1_a3[n] = mpn_add_n (a1_a3, a1, a3, n);
+ a1_a3[n] += mpn_add (a1_a3, a1_a3, n, a5, s);
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
+ {
+ mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1);
+ vm1_neg = 1;
+ }
+ else
+ {
+ mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1);
+ vm1_neg = 0;
+ }
+#else
+ mpn_add_n (as1, a0_a2, a1_a3, n + 1);
+ if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
+ {
+ mpn_sub_n (asm1, a1_a3, a0_a2, n + 1);
+ vm1_neg = 1;
+ }
+ else
+ {
+ mpn_sub_n (asm1, a0_a2, a1_a3, n + 1);
+ vm1_neg = 0;
+ }
+#endif
+ /* Compute as2. */
#if HAVE_NATIVE_mpn_addlsh1_n
- cy = mpn_addlsh1_n (ash, a1, a0, n);
- cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
- cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
- cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
- if (s < n)
+ cy = mpn_addlsh1_n (as2, a4, a5, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a3, as2, n);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
+ cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
+#else
+ cy = mpn_lshift (as2, a5, s, 1);
+ cy += mpn_add_n (as2, a4, as2, s);
+ if (s != n)
+ cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a3, as2, n);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a2, as2, n);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a1, as2, n);
+ cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+ cy += mpn_add_n (as2, a0, as2, n);
+#endif
+ as2[n] = cy;
+
+ /* Compute ash and asmh. */
+#if HAVE_NATIVE_mpn_addlsh_n
+ cy = mpn_addlsh_n (a0_a2, a2, a0, n, 2); /* 4a0 + a2 */
+ cy = 4 * cy + mpn_addlsh_n (a0_a2, a4, a0_a2, n, 2); /* 16a0 + 4a2 + a4 */
+ cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */
+ a0_a2[n] = cy;
+ cy = mpn_addlsh_n (a1_a3, a3, a1, n, 2); /* 4a1 */
+ cy = 4 * cy + mpn_addlsh_n (a1_a3, a5, a1_a3, n, 2); /* 16a1 + 4a3 */
+ a1_a3[n] = cy;
+#else
+ cy = mpn_lshift (a0_a2, a0, n, 2); /* 4a0 */
+ cy += mpn_add_n (a0_a2, a2, a0_a2, n); /* 4a0 + a2 */
+ cy = 4 * cy + mpn_lshift (a0_a2, a0_a2, n, 2); /* 16a0 + 4a2 */
+ cy += mpn_add_n (a0_a2, a4, a0_a2, n); /* 16a0 + 4a2 + a4 */
+ cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1); /* 32a0 + 8a2 + 2a4 */
+ a0_a2[n] = cy;
+ cy = mpn_lshift (a1_a3, a1, n, 2); /* 4a1 */
+ cy += mpn_add_n (a1_a3, a3, a1_a3, n); /* 4a1 + a3 */
+ cy = 4 * cy + mpn_lshift (a1_a3, a1_a3, n, 2); /* 16a1 + 4a3 */
+ cy += mpn_add (a1_a3, a1_a3, n, a5, s); /* 16a1 + 4a3 + a5 */
+ a1_a3[n] = cy;
+#endif
+#if HAVE_NATIVE_mpn_addsub_n
+ if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
{
- mp_limb_t cy2;
- cy2 = mpn_addlsh1_n (ash, a5, ash, s);
- ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
- MPN_INCR_U (ash + s, n+1-s, cy2);
+ mpn_addsub_n (ash, asmh, a1_a3, a0_a2, n + 1);
+ vmh_neg = 1;
}
else
- ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
+ {
+ mpn_addsub_n (ash, asmh, a0_a2, a1_a3, n + 1);
+ vmh_neg = 0;
+ }
#else
- cy = mpn_lshift (ash, a0, n, 1);
- cy += mpn_add_n (ash, ash, a1, n);
- cy = 2*cy + mpn_lshift (ash, ash, n, 1);
- cy += mpn_add_n (ash, ash, a2, n);
- cy = 2*cy + mpn_lshift (ash, ash, n, 1);
- cy += mpn_add_n (ash, ash, a3, n);
- cy = 2*cy + mpn_lshift (ash, ash, n, 1);
- cy += mpn_add_n (ash, ash, a4, n);
- cy = 2*cy + mpn_lshift (ash, ash, n, 1);
- ash[n] = cy + mpn_add (ash, ash, n, a5, s);
+ mpn_add_n (ash, a0_a2, a1_a3, n + 1);
+ if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
+ {
+ mpn_sub_n (asmh, a1_a3, a0_a2, n + 1);
+ vmh_neg = 1;
+ }
+ else
+ {
+ mpn_sub_n (asmh, a0_a2, a1_a3, n + 1);
+ vmh_neg = 0;
+ }
#endif
/* Compute bs1 and bsm1. */
if (t == n)
{
-#if HAVE_NATIVE_mpn_add_n_sub_n
+#if HAVE_NATIVE_mpn_addsub_n
if (mpn_cmp (b0, b1, n) < 0)
{
- cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
- bflags = toom7_w3_neg;
+ cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
+ bsm_neg = 1;
}
else
{
- cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
- bflags = (enum toom7_flags) 0;
+ cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
+ bsm_neg = 0;
}
bs1[n] = cy >> 1;
#else
@@ -163,12 +227,12 @@ mpn_toom62_mul (mp_ptr pp,
if (mpn_cmp (b0, b1, n) < 0)
{
mpn_sub_n (bsm1, b1, b0, n);
- bflags = toom7_w3_neg;
+ bsm_neg = 1;
}
else
{
mpn_sub_n (bsm1, b0, b1, n);
- bflags = (enum toom7_flags) 0;
+ bsm_neg = 0;
}
#endif
}
@@ -179,83 +243,56 @@ mpn_toom62_mul (mp_ptr pp,
{
mpn_sub_n (bsm1, b1, b0, t);
MPN_ZERO (bsm1 + t, n - t);
- bflags = toom7_w3_neg;
+ bsm_neg = 1;
}
else
{
mpn_sub (bsm1, b0, n, b1, t);
- bflags = (enum toom7_flags) 0;
+ bsm_neg = 0;
}
}
- /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 =
- bsm1 - b1 */
+ vm1_neg ^= bsm_neg;
+
+ /* Compute bs2, recycling bs1. bs2=bs1+b1 */
mpn_add (bs2, bs1, n + 1, b1, t);
- if (bflags & toom7_w3_neg)
- {
- bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
- bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
- }
- else
+
+ /* Compute bsh and bsmh, recycling bs1 and bsm1. bsh=bs1+b0; bsmh=bsm1+b0 */
+ if (bsm_neg == 1)
{
- /* FIXME: Simplify this logic? */
- if (t < n)
+ bsmh[n] = 0;
+ if (mpn_cmp (bsm1, b0, n) < 0)
{
- if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
- {
- ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
- MPN_ZERO (bsm2 + t, n + 1 - t);
- bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
- }
- else
- {
- ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
- bsm2[n] = 0;
- }
+ bsm_neg = 0;
+ mpn_sub_n (bsmh, b0, bsm1, n);
}
else
- {
- if (mpn_cmp (bsm1, b1, n) < 0)
- {
- ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
- bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
- }
- else
- {
- ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
- }
- bsm2[n] = 0;
- }
+ mpn_sub_n (bsmh, bsm1, b0, n);
}
+ else
+ bsmh[n] = mpn_add_n (bsmh, bsm1, b0, n);
+ mpn_add (bsh, bs1, n + 1, b0, n);
+ vmh_neg ^= bsm_neg;
- /* Compute bsh, recycling bs1. bsh=bs1+b0; */
- bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);
ASSERT (as1[n] <= 5);
ASSERT (bs1[n] <= 1);
ASSERT (asm1[n] <= 2);
+/*ASSERT (bsm1[n] == 0);*/
ASSERT (as2[n] <= 62);
ASSERT (bs2[n] <= 2);
- ASSERT (asm2[n] <= 41);
- ASSERT (bsm2[n] <= 1);
ASSERT (ash[n] <= 62);
ASSERT (bsh[n] <= 2);
+ ASSERT (asmh[n] <= 41);
+ ASSERT (bsmh[n] <= 1);
#define v0 pp /* 2n */
-#define v1 (pp + 2 * n) /* 2n+1 */
+#define v1 (scratch + 6 * n + 6) /* 2n+1 */
#define vinf (pp + 6 * n) /* s+t */
-#define v2 scratch /* 2n+1 */
-#define vm2 (scratch + 2 * n + 1) /* 2n+1 */
-#define vh (scratch + 4 * n + 2) /* 2n+1 */
-#define vm1 (scratch + 6 * n + 3) /* 2n+1 */
-#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */
- /* Total scratch need: 10*n+5 */
-
- /* Must be in allocation order, as they overwrite one limb beyond
- * 2n+1. */
- mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */
- mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */
- mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */
+#define vm1 scratch /* 2n+1 */
+#define v2 (scratch + 2 * n + 2) /* 2n+1 */
+#define vh (pp + 2 * n) /* 2n+1 */
+#define vmh (scratch + 4 * n + 4)
/* vm1, 2n+1 limbs */
mpn_mul_n (vm1, asm1, bsm1, n);
@@ -274,6 +311,12 @@ mpn_toom62_mul (mp_ptr pp,
}
vm1[2 * n] = cy;
+ mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */
+
+ /* vinf, s+t limbs */
+ if (s > t) mpn_mul (vinf, a5, s, b1, t);
+ else mpn_mul (vinf, b1, t, a5, s);
+
/* v1, 2n+1 limbs */
mpn_mul_n (v1, as1, bs1, n);
if (as1[n] == 1)
@@ -298,14 +341,16 @@ mpn_toom62_mul (mp_ptr pp,
cy += mpn_add_n (v1 + n, v1 + n, as1, n);
v1[2 * n] = cy;
- mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */
+ mpn_mul_n (vh, ash, bsh, n + 1);
- /* vinf, s+t limbs */
- if (s > t) mpn_mul (vinf, a5, s, b1, t);
- else mpn_mul (vinf, b1, t, a5, s);
+ mpn_mul_n (vmh, asmh, bsmh, n + 1);
+
+ mpn_mul_n (v0, ap, bp, n); /* v0, 2n limbs */
+
+ flags = vm1_neg ? toom4_w3_neg : 0;
+ flags |= vmh_neg ? toom4_w1_neg : 0;
- mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
- vm2, vm1, v2, vh, s + t, scratch_out);
+ mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8 * n + 8);
TMP_FREE;
}
diff --git a/gmp/mpn/generic/toom63_mul.c b/gmp/mpn/generic/toom63_mul.c
deleted file mode 100644
index 57c5d3e3dd..0000000000
--- a/gmp/mpn/generic/toom63_mul.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Implementation of the algorithm for Toom-Cook 4.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Stores |{ap,n}-{bp,n}| in {rp,n}, returns the sign. */
-static int
-abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
-{
- mp_limb_t x, y;
- while (--n >= 0)
- {
- x = ap[n];
- y = bp[n];
- if (x != y)
- {
- n++;
- if (x > y)
- {
- mpn_sub_n (rp, ap, bp, n);
- return 0;
- }
- else
- {
- mpn_sub_n (rp, bp, ap, n);
- return ~0;
- }
- }
- rp[n] = 0;
- }
- return 0;
-}
-
-static int
-abs_sub_add_n (mp_ptr rm, mp_ptr rp, mp_srcptr rs, mp_size_t n) {
- int result;
- result = abs_sub_n (rm, rp, rs, n);
- ASSERT_NOCARRY(mpn_add_n (rp, rp, rs, n));
- return result;
-}
-
-
-/* Toom-4.5, the splitting 6x3 unbalanced version.
- Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0.
-
- <--s-><--n--><--n--><--n--><--n--><--n-->
- ____ ______ ______ ______ ______ ______
- |_a5_|__a4__|__a3__|__a2__|__a1__|__a0__|
- |b2_|__b1__|__b0__|
- <-t-><--n--><--n-->
-
-*/
-#define TOOM_63_MUL_N_REC(p, a, b, n, ws) \
- do { mpn_mul_n (p, a, b, n); \
- } while (0)
-
-#define TOOM_63_MUL_REC(p, a, na, b, nb, ws) \
- do { mpn_mul (p, a, na, b, nb); \
- } while (0)
-
-void
-mpn_toom63_mul (mp_ptr pp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
-{
- mp_size_t n, s, t;
- mp_limb_t cy;
- int sign;
-
- /***************************** decomposition *******************************/
-#define a5 (ap + 5 * n)
-#define b0 (bp + 0 * n)
-#define b1 (bp + 1 * n)
-#define b2 (bp + 2 * n)
-
- ASSERT (an >= bn);
- n = 1 + (an >= 2 * bn ? (an - 1) / (size_t) 6 : (bn - 1) / (size_t) 3);
-
- s = an - 5 * n;
- t = bn - 2 * n;
-
- ASSERT (0 < s && s <= n);
- ASSERT (0 < t && t <= n);
- /* WARNING! it assumes s+t>=n */
- ASSERT ( s + t >= n );
- ASSERT ( s + t > 4);
- /* WARNING! it assumes n>1 */
- ASSERT ( n > 2);
-
-#define r8 pp /* 2n */
-#define r7 scratch /* 3n+1 */
-#define r5 (pp + 3*n) /* 3n+1 */
-#define v0 (pp + 3*n) /* n+1 */
-#define v1 (pp + 4*n+1) /* n+1 */
-#define v2 (pp + 5*n+2) /* n+1 */
-#define v3 (pp + 6*n+3) /* n+1 */
-#define r3 (scratch + 3 * n + 1) /* 3n+1 */
-#define r1 (pp + 7*n) /* s+t <= 2*n */
-#define ws (scratch + 6 * n + 2) /* ??? */
-
- /* Alloc also 3n+1 limbs for ws... mpn_toom_interpolate_8pts may
- need all of them, when DO_mpn_sublsh_n usea a scratch */
-/* if (scratch == NULL) scratch = TMP_SALLOC_LIMBS (9 * n + 3); */
-
- /********************** evaluation and recursive calls *********************/
- /* $\pm4$ */
- sign = mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp);
- pp[n] = mpn_lshift (pp, b1, n, 2); /* 4b1 */
- /* FIXME: use addlsh */
- v3[t] = mpn_lshift (v3, b2, t, 4);/* 16b2 */
- if ( n == t )
- v3[n]+= mpn_add_n (v3, v3, b0, n); /* 16b2+b0 */
- else
- v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 16b2+b0 */
- sign ^= abs_sub_add_n (v1, v3, pp, n + 1);
- TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */
- TOOM_63_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */
- mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4);
-
- /* $\pm1$ */
- sign = mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s, pp);
- /* Compute bs1 and bsm1. Code taken from toom33 */
- cy = mpn_add (ws, b0, n, b2, t);
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (cy == 0 && mpn_cmp (ws, b1, n) < 0)
- {
- cy = mpn_add_n_sub_n (v3, v1, b1, ws, n);
- v3[n] = cy >> 1;
- v1[n] = 0;
- sign = ~sign;
- }
- else
- {
- mp_limb_t cy2;
- cy2 = mpn_add_n_sub_n (v3, v1, ws, b1, n);
- v3[n] = cy + (cy2 >> 1);
- v1[n] = cy - (cy2 & 1);
- }
-#else
- v3[n] = cy + mpn_add_n (v3, ws, b1, n);
- if (cy == 0 && mpn_cmp (ws, b1, n) < 0)
- {
- mpn_sub_n (v1, b1, ws, n);
- v1[n] = 0;
- sign = ~sign;
- }
- else
- {
- cy -= mpn_sub_n (v1, ws, b1, n);
- v1[n] = cy;
- }
-#endif
- TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */
- TOOM_63_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */
- mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0);
-
- /* $\pm2$ */
- sign = mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp);
- pp[n] = mpn_lshift (pp, b1, n, 1); /* 2b1 */
- /* FIXME: use addlsh or addlsh2 */
- v3[t] = mpn_lshift (v3, b2, t, 2);/* 4b2 */
- if ( n == t )
- v3[n]+= mpn_add_n (v3, v3, b0, n); /* 4b2+b0 */
- else
- v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 4b2+b0 */
- sign ^= abs_sub_add_n (v1, v3, pp, n + 1);
- TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */
- TOOM_63_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */
- mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2);
-
- /* A(0)*B(0) */
- TOOM_63_MUL_N_REC(pp, ap, bp, n, ws);
-
- /* Infinity */
- if (s > t) {
- TOOM_63_MUL_REC(r1, a5, s, b2, t, ws);
- } else {
- TOOM_63_MUL_REC(r1, b2, t, a5, s, ws);
- };
-
- mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws);
-
-#undef a5
-#undef b0
-#undef b1
-#undef b2
-#undef r1
-#undef r3
-#undef r5
-#undef v0
-#undef v1
-#undef v2
-#undef v3
-#undef r7
-#undef r8
-#undef ws
-}
diff --git a/gmp/mpn/generic/toom6_sqr.c b/gmp/mpn/generic/toom6_sqr.c
deleted file mode 100644
index e5ab7dcd1d..0000000000
--- a/gmp/mpn/generic/toom6_sqr.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Implementation of the squaring algorithm with Toom-Cook 6.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-#if GMP_NUMB_BITS < 21
-#error Not implemented.
-#endif
-
-
-#if TUNE_PROGRAM_BUILD
-#define MAYBE_sqr_basecase 1
-#define MAYBE_sqr_above_basecase 1
-#define MAYBE_sqr_toom2 1
-#define MAYBE_sqr_above_toom2 1
-#define MAYBE_sqr_toom3 1
-#define MAYBE_sqr_above_toom3 1
-#define MAYBE_sqr_above_toom4 1
-#else
-#ifdef SQR_TOOM8_THRESHOLD
-#define SQR_TOOM6_MAX ((SQR_TOOM8_THRESHOLD+6*2-1+5)/6)
-#else
-#define SQR_TOOM6_MAX \
- ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (6*2-1+5)) ? \
- ((SQR_FFT_THRESHOLD+6*2-1+5)/6) \
- : MP_SIZE_T_MAX )
-#endif
-#define MAYBE_sqr_basecase \
- (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM2_THRESHOLD)
-#define MAYBE_sqr_above_basecase \
- (SQR_TOOM6_MAX >= SQR_TOOM2_THRESHOLD)
-#define MAYBE_sqr_toom2 \
- (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM3_THRESHOLD)
-#define MAYBE_sqr_above_toom2 \
- (SQR_TOOM6_MAX >= SQR_TOOM3_THRESHOLD)
-#define MAYBE_sqr_toom3 \
- (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM4_THRESHOLD)
-#define MAYBE_sqr_above_toom3 \
- (SQR_TOOM6_MAX >= SQR_TOOM4_THRESHOLD)
-#define MAYBE_sqr_above_toom4 \
- (SQR_TOOM6_MAX >= SQR_TOOM6_THRESHOLD)
-#endif
-
-#define TOOM6_SQR_REC(p, a, n, ws) \
- do { \
- if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase \
- || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) \
- mpn_sqr_basecase (p, a, n); \
- else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2 \
- || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) \
- mpn_toom2_sqr (p, a, n, ws); \
- else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3 \
- || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) \
- mpn_toom3_sqr (p, a, n, ws); \
- else if (! MAYBE_sqr_above_toom4 \
- || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD)) \
- mpn_toom4_sqr (p, a, n, ws); \
- else \
- mpn_toom6_sqr (p, a, n, ws); \
- } while (0)
-
-void
-mpn_toom6_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
-{
- mp_size_t n, s;
-
- /***************************** decomposition *******************************/
-
- ASSERT( an >= 18 );
-
- n = 1 + (an - 1) / (size_t) 6;
-
- s = an - 5 * n;
-
- ASSERT (0 < s && s <= n);
-
-#define r4 (pp + 3 * n) /* 3n+1 */
-#define r2 (pp + 7 * n) /* 3n+1 */
-#define r0 (pp +11 * n) /* s+t <= 2*n */
-#define r5 (scratch) /* 3n+1 */
-#define r3 (scratch + 3 * n + 1) /* 3n+1 */
-#define r1 (scratch + 6 * n + 2) /* 3n+1 */
-#define v0 (pp + 7 * n) /* n+1 */
-#define v2 (pp + 9 * n+2) /* n+1 */
-#define wse (scratch + 9 * n + 3) /* 3n+1 */
-
- /* Alloc also 3n+1 limbs for ws... toom_interpolate_12pts may
- need all of them, when DO_mpn_sublsh_n usea a scratch */
-/* if (scratch== NULL) */
-/* scratch = TMP_SALLOC_LIMBS (12 * n + 6); */
-
- /********************** evaluation and recursive calls *********************/
- /* $\pm1/2$ */
- mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 1, pp);
- TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/2)*B(-1/2)*2^. */
- TOOM6_SQR_REC(r5, v2, n + 1, wse); /* A(+1/2)*B(+1/2)*2^. */
- mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 1, 0);
-
- /* $\pm1$ */
- mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s, pp);
- TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1)*B(-1) */
- TOOM6_SQR_REC(r3, v2, n + 1, wse); /* A(1)*B(1) */
- mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 0, 0);
-
- /* $\pm4$ */
- mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp);
- TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-4)*B(-4) */
- TOOM6_SQR_REC(r1, v2, n + 1, wse); /* A(+4)*B(+4) */
- mpn_toom_couple_handling (r1, 2 * n + 1, pp, 0, n, 2, 4);
-
- /* $\pm1/4$ */
- mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 2, pp);
- TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/4)*B(-1/4)*4^. */
- TOOM6_SQR_REC(r4, v2, n + 1, wse); /* A(+1/4)*B(+1/4)*4^. */
- mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 2, 0);
-
- /* $\pm2$ */
- mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp);
- TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-2)*B(-2) */
- TOOM6_SQR_REC(r2, v2, n + 1, wse); /* A(+2)*B(+2) */
- mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 1, 2);
-
-#undef v0
-#undef v2
-
- /* A(0)*B(0) */
- TOOM6_SQR_REC(pp, ap, n, wse);
-
- mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, 2 * s, 0, wse);
-
-#undef r0
-#undef r1
-#undef r2
-#undef r3
-#undef r4
-#undef r5
-
-}
-#undef TOOM6_SQR_REC
-#undef MAYBE_sqr_basecase
-#undef MAYBE_sqr_above_basecase
-#undef MAYBE_sqr_toom2
-#undef MAYBE_sqr_above_toom2
-#undef MAYBE_sqr_toom3
-#undef MAYBE_sqr_above_toom3
-#undef MAYBE_sqr_above_toom4
diff --git a/gmp/mpn/generic/toom6h_mul.c b/gmp/mpn/generic/toom6h_mul.c
deleted file mode 100644
index 420895be8f..0000000000
--- a/gmp/mpn/generic/toom6h_mul.c
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Implementation of the multiplication algorithm for Toom-Cook 6.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-#if GMP_NUMB_BITS < 21
-#error Not implemented.
-#endif
-
-#if TUNE_PROGRAM_BUILD
-#define MAYBE_mul_basecase 1
-#define MAYBE_mul_toom22 1
-#define MAYBE_mul_toom33 1
-#define MAYBE_mul_toom6h 1
-#else
-#define MAYBE_mul_basecase \
- (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM22_THRESHOLD)
-#define MAYBE_mul_toom22 \
- (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM33_THRESHOLD)
-#define MAYBE_mul_toom33 \
- (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM44_THRESHOLD)
-#define MAYBE_mul_toom6h \
- (MUL_FFT_THRESHOLD >= 6 * MUL_TOOM6H_THRESHOLD)
-#endif
-
-#define TOOM6H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws) \
- do { \
- if (MAYBE_mul_basecase \
- && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) { \
- mpn_mul_basecase (p, a, n, b, n); \
- if (f) \
- mpn_mul_basecase (p2, a2, n, b2, n); \
- } else if (MAYBE_mul_toom22 \
- && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) { \
- mpn_toom22_mul (p, a, n, b, n, ws); \
- if (f) \
- mpn_toom22_mul (p2, a2, n, b2, n, ws); \
- } else if (MAYBE_mul_toom33 \
- && BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) { \
- mpn_toom33_mul (p, a, n, b, n, ws); \
- if (f) \
- mpn_toom33_mul (p2, a2, n, b2, n, ws); \
- } else if (! MAYBE_mul_toom6h \
- || BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) { \
- mpn_toom44_mul (p, a, n, b, n, ws); \
- if (f) \
- mpn_toom44_mul (p2, a2, n, b2, n, ws); \
- } else { \
- mpn_toom6h_mul (p, a, n, b, n, ws); \
- if (f) \
- mpn_toom6h_mul (p2, a2, n, b2, n, ws); \
- } \
- } while (0)
-
-#define TOOM6H_MUL_REC(p, a, na, b, nb, ws) \
- do { mpn_mul (p, a, na, b, nb); \
- } while (0)
-
-/* Toom-6.5 , compute the product {pp,an+bn} <- {ap,an} * {bp,bn}
- With: an >= bn >= 46, an*6 < bn * 17.
- It _may_ work with bn<=46 and bn*17 < an*6 < bn*18
-
- Evaluate in: infinity, +4, -4, +2, -2, +1, -1, +1/2, -1/2, +1/4, -1/4, 0.
-*/
-/* Estimate on needed scratch:
- S(n) <= (n+5)\6*10+4+MAX(S((n+5)\6),1+2*(n+5)\6),
- since n>42; S(n) <= ceil(log(n)/log(6))*(10+4)+n*12\6 < n*2 + lg2(n)*6
- */
-
-void
-mpn_toom6h_mul (mp_ptr pp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
-{
- mp_size_t n, s, t;
- int p, q, half;
- int sign;
-
- /***************************** decomposition *******************************/
-
- ASSERT (an >= bn);
- /* Can not handle too much unbalancement */
- ASSERT (bn >= 42);
- /* Can not handle too much unbalancement */
- ASSERT ((an*3 < bn * 8) || (bn >= 46 && an * 6 < bn * 17));
-
- /* Limit num/den is a rational number between
- (12/11)^(log(4)/log(2*4-1)) and (12/11)^(log(6)/log(2*6-1)) */
-#define LIMIT_numerator (18)
-#define LIMIT_denominat (17)
-
- if (LIKELY (an * LIMIT_denominat < LIMIT_numerator * bn)) /* is 6*... < 6*... */
- {
- n = 1 + (an - 1) / (size_t) 6;
- p = q = 5;
- half = 0;
-
- s = an - 5 * n;
- t = bn - 5 * n;
- }
- else {
- if (an * 5 * LIMIT_numerator < LIMIT_denominat * 7 * bn)
- { p = 7; q = 6; }
- else if (an * 5 * LIMIT_denominat < LIMIT_numerator * 7 * bn)
- { p = 7; q = 5; }
- else if (an * LIMIT_numerator < LIMIT_denominat * 2 * bn) /* is 4*... < 8*... */
- { p = 8; q = 5; }
- else if (an * LIMIT_denominat < LIMIT_numerator * 2 * bn) /* is 4*... < 8*... */
- { p = 8; q = 4; }
- else
- { p = 9; q = 4; }
-
- half = (p ^ q) & 1;
- n = 1 + (q * an >= p * bn ? (an - 1) / (size_t) p : (bn - 1) / (size_t) q);
- p--; q--;
-
- s = an - p * n;
- t = bn - q * n;
-
- /* With LIMIT = 16/15, the following recover is needed only if bn<=73*/
- if (half) { /* Recover from badly chosen splitting */
- if (UNLIKELY (s<1)) {p--; s+=n; half=0;}
- else if (UNLIKELY (t<1)) {q--; t+=n; half=0;}
- }
- }
-#undef LIMIT_numerator
-#undef LIMIT_denominat
-
- ASSERT (0 < s && s <= n);
- ASSERT (0 < t && t <= n);
- ASSERT (half || s + t > 3);
- ASSERT (n > 2);
-
-#define r4 (pp + 3 * n) /* 3n+1 */
-#define r2 (pp + 7 * n) /* 3n+1 */
-#define r0 (pp +11 * n) /* s+t <= 2*n */
-#define r5 (scratch) /* 3n+1 */
-#define r3 (scratch + 3 * n + 1) /* 3n+1 */
-#define r1 (scratch + 6 * n + 2) /* 3n+1 */
-#define v0 (pp + 7 * n) /* n+1 */
-#define v1 (pp + 8 * n+1) /* n+1 */
-#define v2 (pp + 9 * n+2) /* n+1 */
-#define v3 (scratch + 9 * n + 3) /* n+1 */
-#define wsi (scratch + 9 * n + 3) /* 3n+1 */
-#define wse (scratch +10 * n + 4) /* 2n+1 */
-
- /* Alloc also 3n+1 limbs for wsi... toom_interpolate_12pts may
- need all of them */
-/* if (scratch == NULL) */
-/* scratch = TMP_SALLOC_LIMBS(mpn_toom6_sqr_itch(n * 6)); */
- ASSERT (12 * n + 6 <= mpn_toom6h_mul_itch(an,bn));
- ASSERT (12 * n + 6 <= mpn_toom6_sqr_itch(n * 6));
-
- /********************** evaluation and recursive calls *********************/
- /* $\pm1/2$ */
- sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^
- mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp);
- /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */
- TOOM6H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 1+half , half);
-
- /* $\pm1$ */
- sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s, pp);
- if (UNLIKELY (q == 3))
- sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp);
- else
- sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t, pp);
- /* A(-1)*B(-1) */ /* A(1)*B(1) */
- TOOM6H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 0, 0);
-
- /* $\pm4$ */
- sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^
- mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp);
- /* A(-4)*B(-4) */
- TOOM6H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse); /* A(+4)*B(+4) */
- mpn_toom_couple_handling (r1, 2 * n + 1, pp, sign, n, 2, 4);
-
- /* $\pm1/4$ */
- sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^
- mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp);
- /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
- TOOM6H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half));
-
- /* $\pm2$ */
- sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^
- mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp);
- /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
- TOOM6H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 1, 2);
-
-#undef v0
-#undef v1
-#undef v2
-#undef v3
-#undef wse
-
- /* A(0)*B(0) */
- TOOM6H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi);
-
- /* Infinity */
- if (UNLIKELY (half != 0)) {
- if (s > t) {
- TOOM6H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi);
- } else {
- TOOM6H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi);
- };
- };
-
- mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, s+t, half, wsi);
-
-#undef r0
-#undef r1
-#undef r2
-#undef r3
-#undef r4
-#undef r5
-#undef wsi
-}
-
-#undef TOOM6H_MUL_N_REC
-#undef TOOM6H_MUL_REC
-#undef MAYBE_mul_basecase
-#undef MAYBE_mul_toom22
-#undef MAYBE_mul_toom33
-#undef MAYBE_mul_toom6h
diff --git a/gmp/mpn/generic/toom8_sqr.c b/gmp/mpn/generic/toom8_sqr.c
deleted file mode 100644
index 0c93678815..0000000000
--- a/gmp/mpn/generic/toom8_sqr.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Implementation of the squaring algorithm with Toom-Cook 8.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#if GMP_NUMB_BITS < 29
-#error Not implemented.
-#endif
-
-#if GMP_NUMB_BITS < 43
-#define BIT_CORRECTION 1
-#define CORRECTION_BITS GMP_NUMB_BITS
-#else
-#define BIT_CORRECTION 0
-#define CORRECTION_BITS 0
-#endif
-
-#ifndef SQR_TOOM8_THRESHOLD
-#define SQR_TOOM8_THRESHOLD MUL_TOOM8H_THRESHOLD
-#endif
-
-#ifndef SQR_TOOM6_THRESHOLD
-#define SQR_TOOM6_THRESHOLD MUL_TOOM6H_THRESHOLD
-#endif
-
-#if TUNE_PROGRAM_BUILD
-#define MAYBE_sqr_basecase 1
-#define MAYBE_sqr_above_basecase 1
-#define MAYBE_sqr_toom2 1
-#define MAYBE_sqr_above_toom2 1
-#define MAYBE_sqr_toom3 1
-#define MAYBE_sqr_above_toom3 1
-#define MAYBE_sqr_toom4 1
-#define MAYBE_sqr_above_toom4 1
-#define MAYBE_sqr_above_toom6 1
-#else
-#define SQR_TOOM8_MAX \
- ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (8*2-1+7)) ? \
- ((SQR_FFT_THRESHOLD+8*2-1+7)/8) \
- : MP_SIZE_T_MAX )
-#define MAYBE_sqr_basecase \
- (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD)
-#define MAYBE_sqr_above_basecase \
- (SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD)
-#define MAYBE_sqr_toom2 \
- (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD)
-#define MAYBE_sqr_above_toom2 \
- (SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD)
-#define MAYBE_sqr_toom3 \
- (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD)
-#define MAYBE_sqr_above_toom3 \
- (SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD)
-#define MAYBE_sqr_toom4 \
- (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD)
-#define MAYBE_sqr_above_toom4 \
- (SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD)
-#define MAYBE_sqr_above_toom6 \
- (SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD)
-#endif
-
-#define TOOM8_SQR_REC(p, a, f, p2, a2, n, ws) \
- do { \
- if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase \
- || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) { \
- mpn_sqr_basecase (p, a, n); \
- if (f) mpn_sqr_basecase (p2, a2, n); \
- } else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2 \
- || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) { \
- mpn_toom2_sqr (p, a, n, ws); \
- if (f) mpn_toom2_sqr (p2, a2, n, ws); \
- } else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3 \
- || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) { \
- mpn_toom3_sqr (p, a, n, ws); \
- if (f) mpn_toom3_sqr (p2, a2, n, ws); \
- } else if (MAYBE_sqr_toom4 && ( !MAYBE_sqr_above_toom4 \
- || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))) { \
- mpn_toom4_sqr (p, a, n, ws); \
- if (f) mpn_toom4_sqr (p2, a2, n, ws); \
- } else if (! MAYBE_sqr_above_toom6 \
- || BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) { \
- mpn_toom6_sqr (p, a, n, ws); \
- if (f) mpn_toom6_sqr (p2, a2, n, ws); \
- } else { \
- mpn_toom8_sqr (p, a, n, ws); \
- if (f) mpn_toom8_sqr (p2, a2, n, ws); \
- } \
- } while (0)
-
-void
-mpn_toom8_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
-{
- mp_size_t n, s;
-
- /***************************** decomposition *******************************/
-
- ASSERT ( an >= 40 );
-
- n = 1 + ((an - 1)>>3);
-
- s = an - 7 * n;
-
- ASSERT (0 < s && s <= n);
- ASSERT ( s + s > 3 );
-
-#define r6 (pp + 3 * n) /* 3n+1 */
-#define r4 (pp + 7 * n) /* 3n+1 */
-#define r2 (pp +11 * n) /* 3n+1 */
-#define r0 (pp +15 * n) /* s+t <= 2*n */
-#define r7 (scratch) /* 3n+1 */
-#define r5 (scratch + 3 * n + 1) /* 3n+1 */
-#define r3 (scratch + 6 * n + 2) /* 3n+1 */
-#define r1 (scratch + 9 * n + 3) /* 3n+1 */
-#define v0 (pp +11 * n) /* n+1 */
-#define v2 (pp +13 * n+2) /* n+1 */
-#define wse (scratch +12 * n + 4) /* 3n+1 */
-
- /* Alloc also 3n+1 limbs for ws... toom_interpolate_16pts may
- need all of them, when DO_mpn_sublsh_n usea a scratch */
-/* if (scratch == NULL) */
-/* scratch = TMP_SALLOC_LIMBS (30 * n + 6); */
-
- /********************** evaluation and recursive calls *********************/
- /* $\pm1/8$ */
- mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 3, pp);
- /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */
- TOOM8_SQR_REC(pp, v0, 2, r7, v2, n + 1, wse);
- mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 0);
-
- /* $\pm1/4$ */
- mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 2, pp);
- /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
- TOOM8_SQR_REC(pp, v0, 2, r5, v2, n + 1, wse);
- mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 2, 0);
-
- /* $\pm2$ */
- mpn_toom_eval_pm2 (v2, v0, 7, ap, n, s, pp);
- /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
- TOOM8_SQR_REC(pp, v0, 2, r3, v2, n + 1, wse);
- mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 1, 2);
-
- /* $\pm8$ */
- mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 3, pp);
- /* A(-8)*B(-8) */ /* A(+8)*B(+8) */
- TOOM8_SQR_REC(pp, v0, 2, r1, v2, n + 1, wse);
- mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 6);
-
- /* $\pm1/2$ */
- mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 1, pp);
- /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */
- TOOM8_SQR_REC(pp, v0, 2, r6, v2, n + 1, wse);
- mpn_toom_couple_handling (r6, 2 * n + 1, pp, 0, n, 1, 0);
-
- /* $\pm1$ */
- mpn_toom_eval_pm1 (v2, v0, 7, ap, n, s, pp);
- /* A(-1)*B(-1) */ /* A(1)*B(1) */
- TOOM8_SQR_REC(pp, v0, 2, r4, v2, n + 1, wse);
- mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 0, 0);
-
- /* $\pm4$ */
- mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 2, pp);
- /* A(-4)*B(-4) */ /* A(+4)*B(+4) */
- TOOM8_SQR_REC(pp, v0, 2, r2, v2, n + 1, wse);
- mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 2, 4);
-
-#undef v0
-#undef v2
-
- /* A(0)*B(0) */
- TOOM8_SQR_REC(pp, ap, 0, pp, ap, n, wse);
-
- mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, 2 * s, 0, wse);
-
-#undef r0
-#undef r1
-#undef r2
-#undef r3
-#undef r4
-#undef r5
-#undef r6
-#undef wse
-
-}
-
-#undef TOOM8_SQR_REC
-#undef MAYBE_sqr_basecase
-#undef MAYBE_sqr_above_basecase
-#undef MAYBE_sqr_toom2
-#undef MAYBE_sqr_above_toom2
-#undef MAYBE_sqr_toom3
-#undef MAYBE_sqr_above_toom3
-#undef MAYBE_sqr_above_toom4
diff --git a/gmp/mpn/generic/toom8h_mul.c b/gmp/mpn/generic/toom8h_mul.c
deleted file mode 100644
index 8f593903f5..0000000000
--- a/gmp/mpn/generic/toom8h_mul.c
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Implementation of the multiplication algorithm for Toom-Cook 8.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-#if GMP_NUMB_BITS < 29
-#error Not implemented.
-#endif
-
-#if GMP_NUMB_BITS < 43
-#define BIT_CORRECTION 1
-#define CORRECTION_BITS GMP_NUMB_BITS
-#else
-#define BIT_CORRECTION 0
-#define CORRECTION_BITS 0
-#endif
-
-
-#if TUNE_PROGRAM_BUILD
-#define MAYBE_mul_basecase 1
-#define MAYBE_mul_toom22 1
-#define MAYBE_mul_toom33 1
-#define MAYBE_mul_toom44 1
-#define MAYBE_mul_toom8h 1
-#else
-#define MAYBE_mul_basecase \
- (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM22_THRESHOLD)
-#define MAYBE_mul_toom22 \
- (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM33_THRESHOLD)
-#define MAYBE_mul_toom33 \
- (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM44_THRESHOLD)
-#define MAYBE_mul_toom44 \
- (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM6H_THRESHOLD)
-#define MAYBE_mul_toom8h \
- (MUL_FFT_THRESHOLD >= 8 * MUL_TOOM8H_THRESHOLD)
-#endif
-
-#define TOOM8H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws) \
- do { \
- if (MAYBE_mul_basecase \
- && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) { \
- mpn_mul_basecase (p, a, n, b, n); \
- if (f) mpn_mul_basecase (p2, a2, n, b2, n); \
- } else if (MAYBE_mul_toom22 \
- && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) { \
- mpn_toom22_mul (p, a, n, b, n, ws); \
- if (f) mpn_toom22_mul (p2, a2, n, b2, n, ws); \
- } else if (MAYBE_mul_toom33 \
- && BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) { \
- mpn_toom33_mul (p, a, n, b, n, ws); \
- if (f) mpn_toom33_mul (p2, a2, n, b2, n, ws); \
- } else if (MAYBE_mul_toom44 \
- && BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) { \
- mpn_toom44_mul (p, a, n, b, n, ws); \
- if (f) mpn_toom44_mul (p2, a2, n, b2, n, ws); \
- } else if (! MAYBE_mul_toom8h \
- || BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD)) { \
- mpn_toom6h_mul (p, a, n, b, n, ws); \
- if (f) mpn_toom6h_mul (p2, a2, n, b2, n, ws); \
- } else { \
- mpn_toom8h_mul (p, a, n, b, n, ws); \
- if (f) mpn_toom8h_mul (p2, a2, n, b2, n, ws); \
- } \
- } while (0)
-
-#define TOOM8H_MUL_REC(p, a, na, b, nb, ws) \
- do { mpn_mul (p, a, na, b, nb); } while (0)
-
-/* Toom-8.5 , compute the product {pp,an+bn} <- {ap,an} * {bp,bn}
- With: an >= bn >= 86, an*5 < bn * 11.
- It _may_ work with bn<=?? and bn*?? < an*? < bn*??
-
- Evaluate in: infinity, +8,-8,+4,-4,+2,-2,+1,-1,+1/2,-1/2,+1/4,-1/4,+1/8,-1/8,0.
-*/
-/* Estimate on needed scratch:
- S(n) <= (n+7)\8*13+5+MAX(S((n+7)\8),1+2*(n+7)\8),
- since n>80; S(n) <= ceil(log(n/10)/log(8))*(13+5)+n*15\8 < n*15\8 + lg2(n)*6
- */
-
-void
-mpn_toom8h_mul (mp_ptr pp,
- mp_srcptr ap, mp_size_t an,
- mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
-{
- mp_size_t n, s, t;
- int p, q, half;
- int sign;
-
- /***************************** decomposition *******************************/
-
- ASSERT (an >= bn);
- /* Can not handle too small operands */
- ASSERT (bn >= 86);
- /* Can not handle too much unbalancement */
- ASSERT (an <= bn*4);
- ASSERT (GMP_NUMB_BITS > 11*3 || an*4 <= bn*11);
- ASSERT (GMP_NUMB_BITS > 10*3 || an*1 <= bn* 2);
- ASSERT (GMP_NUMB_BITS > 9*3 || an*2 <= bn* 3);
-
- /* Limit num/den is a rational number between
- (16/15)^(log(6)/log(2*6-1)) and (16/15)^(log(8)/log(2*8-1)) */
-#define LIMIT_numerator (21)
-#define LIMIT_denominat (20)
-
- if (LIKELY (an == bn) || an * (LIMIT_denominat>>1) < LIMIT_numerator * (bn>>1) ) /* is 8*... < 8*... */
- {
- half = 0;
- n = 1 + ((an - 1)>>3);
- p = q = 7;
- s = an - 7 * n;
- t = bn - 7 * n;
- }
- else
- {
- if (an * 13 < 16 * bn) /* (an*7*LIMIT_numerator<LIMIT_denominat*9*bn) */
- { p = 9; q = 8; }
- else if (GMP_NUMB_BITS <= 9*3 ||
- an *(LIMIT_denominat>>1) < (LIMIT_numerator/7*9) * (bn>>1))
- { p = 9; q = 7; }
- else if (an * 10 < 33 * (bn>>1)) /* (an*3*LIMIT_numerator<LIMIT_denominat*5*bn) */
- { p =10; q = 7; }
- else if (GMP_NUMB_BITS <= 10*3 ||
- an * (LIMIT_denominat/5) < (LIMIT_numerator/3) * bn)
- { p =10; q = 6; }
- else if (an * 6 < 13 * bn) /*(an * 5 * LIMIT_numerator < LIMIT_denominat *11 * bn)*/
- { p =11; q = 6; }
- else if (GMP_NUMB_BITS <= 11*3 ||
- an * 4 < 9 * bn)
- { p =11; q = 5; }
- else if (an *(LIMIT_numerator/3) < LIMIT_denominat * bn) /* is 4*... <12*... */
- { p =12; q = 5; }
- else if (GMP_NUMB_BITS <= 12*3 ||
- an * 9 < 28 * bn ) /* is 4*... <12*... */
- { p =12; q = 4; }
- else
- { p =13; q = 4; }
-
- half = (p+q)&1;
- n = 1 + (q * an >= p * bn ? (an - 1) / (size_t) p : (bn - 1) / (size_t) q);
- p--; q--;
-
- s = an - p * n;
- t = bn - q * n;
-
- if(half) { /* Recover from badly chosen splitting */
- if (UNLIKELY (s<1)) {p--; s+=n; half=0;}
- else if (UNLIKELY (t<1)) {q--; t+=n; half=0;}
- }
- }
-#undef LIMIT_numerator
-#undef LIMIT_denominat
-
- ASSERT (0 < s && s <= n);
- ASSERT (0 < t && t <= n);
- ASSERT (half || s + t > 3);
- ASSERT (n > 2);
-
-#define r6 (pp + 3 * n) /* 3n+1 */
-#define r4 (pp + 7 * n) /* 3n+1 */
-#define r2 (pp +11 * n) /* 3n+1 */
-#define r0 (pp +15 * n) /* s+t <= 2*n */
-#define r7 (scratch) /* 3n+1 */
-#define r5 (scratch + 3 * n + 1) /* 3n+1 */
-#define r3 (scratch + 6 * n + 2) /* 3n+1 */
-#define r1 (scratch + 9 * n + 3) /* 3n+1 */
-#define v0 (pp +11 * n) /* n+1 */
-#define v1 (pp +12 * n+1) /* n+1 */
-#define v2 (pp +13 * n+2) /* n+1 */
-#define v3 (scratch +12 * n + 4) /* n+1 */
-#define wsi (scratch +12 * n + 4) /* 3n+1 */
-#define wse (scratch +13 * n + 5) /* 2n+1 */
-
- /* Alloc also 3n+1 limbs for wsi... toom_interpolate_16pts may
- need all of them */
-/* if (scratch == NULL) */
-/* scratch = TMP_SALLOC_LIMBS(mpn_toom8_sqr_itch(n * 8)); */
- ASSERT (15 * n + 6 <= mpn_toom8h_mul_itch (an, bn));
- ASSERT (15 * n + 6 <= mpn_toom8_sqr_itch (n * 8));
-
- /********************** evaluation and recursive calls *********************/
-
- /* $\pm1/8$ */
- sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 3, pp) ^
- mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 3, pp);
- /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */
- TOOM8H_MUL_N_REC(pp, v0, v1, 2, r7, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3*(1+half), 3*(half));
-
- /* $\pm1/4$ */
- sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^
- mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp);
- /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
- TOOM8H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half));
-
- /* $\pm2$ */
- sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^
- mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp);
- /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
- TOOM8H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 1, 2);
-
- /* $\pm8$ */
- sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 3, pp) ^
- mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 3, pp);
- /* A(-8)*B(-8) */ /* A(+8)*B(+8) */
- TOOM8H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3, 6);
-
- /* $\pm1/2$ */
- sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^
- mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp);
- /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */
- TOOM8H_MUL_N_REC(pp, v0, v1, 2, r6, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r6, 2 * n + 1, pp, sign, n, 1+half, half);
-
- /* $\pm1$ */
- sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s, pp);
- if (GMP_NUMB_BITS > 12*3 && UNLIKELY (q == 3))
- sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp);
- else
- sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t, pp);
- /* A(-1)*B(-1) */ /* A(1)*B(1) */
- TOOM8H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 0, 0);
-
- /* $\pm4$ */
- sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^
- mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp);
- /* A(-4)*B(-4) */ /* A(+4)*B(+4) */
- TOOM8H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse);
- mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 2, 4);
-
-#undef v0
-#undef v1
-#undef v2
-#undef v3
-#undef wse
-
- /* A(0)*B(0) */
- TOOM8H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi);
-
- /* Infinity */
- if (UNLIKELY (half != 0)) {
- if (s > t) {
- TOOM8H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi);
- } else {
- TOOM8H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi);
- };
- };
-
- mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, s+t, half, wsi);
-
-#undef r0
-#undef r1
-#undef r2
-#undef r3
-#undef r4
-#undef r5
-#undef r6
-#undef wsi
-}
-
-#undef TOOM8H_MUL_N_REC
-#undef TOOM8H_MUL_REC
-#undef MAYBE_mul_basecase
-#undef MAYBE_mul_toom22
-#undef MAYBE_mul_toom33
-#undef MAYBE_mul_toom44
-#undef MAYBE_mul_toom8h
diff --git a/gmp/mpn/generic/toom_couple_handling.c b/gmp/mpn/generic/toom_couple_handling.c
deleted file mode 100644
index 9e62bcba1c..0000000000
--- a/gmp/mpn/generic/toom_couple_handling.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Helper function for high degree Toom-Cook algorithms.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Gets {pp,n} and (sign?-1:1)*{np,n}. Computes at once:
- {pp,n} <- ({pp,n}+{np,n})/2^{ps+1}
- {pn,n} <- ({pp,n}-{np,n})/2^{ns+1}
- Finally recompose them obtaining:
- {pp,n+off} <- {pp,n}+{np,n}*2^{off*GMP_NUMB_BITS}
-*/
-void
-mpn_toom_couple_handling (mp_ptr pp, mp_size_t n, mp_ptr np,
- int nsign, mp_size_t off, int ps, int ns)
-{
- if (nsign) {
-#ifdef HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (np, pp, np, n);
-#else
- mpn_sub_n (np, pp, np, n);
- mpn_rshift (np, np, n, 1);
-#endif
- } else {
-#ifdef HAVE_NATIVE_mpn_rsh1add_n
- mpn_rsh1add_n (np, pp, np, n);
-#else
- mpn_add_n (np, pp, np, n);
- mpn_rshift (np, np, n, 1);
-#endif
- }
-
-#ifdef HAVE_NATIVE_mpn_rsh1sub_n
- if (ps == 1)
- mpn_rsh1sub_n (pp, pp, np, n);
- else
-#endif
- {
- mpn_sub_n (pp, pp, np, n);
- if (ps > 0)
- mpn_rshift (pp, pp, n, ps);
- }
- if (ns > 0)
- mpn_rshift (np, np, n, ns);
- pp[n] = mpn_add_n (pp+off, pp+off, np, n-off);
- ASSERT_NOCARRY (mpn_add_1(pp+n, np+n-off, off, pp[n]) );
-}
diff --git a/gmp/mpn/generic/toom_eval_dgr3_pm1.c b/gmp/mpn/generic/toom_eval_dgr3_pm1.c
deleted file mode 100644
index 50411bd3ca..0000000000
--- a/gmp/mpn/generic/toom_eval_dgr3_pm1.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/* mpn_toom_eval_dgr3_pm1 -- Evaluate a degree 3 polynomial in +1 and -1
-
- Contributed to the GNU project by Niels Möller
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-int
-mpn_toom_eval_dgr3_pm1 (mp_ptr xp1, mp_ptr xm1,
- mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)
-{
- int neg;
-
- ASSERT (x3n > 0);
- ASSERT (x3n <= n);
-
- xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n);
- tp[n] = mpn_add (tp, xp + n, n, xp + 3*n, x3n);
-
- neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? ~0 : 0;
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (neg)
- mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1);
- else
- mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1);
-#else
- if (neg)
- mpn_sub_n (xm1, tp, xp1, n + 1);
- else
- mpn_sub_n (xm1, xp1, tp, n + 1);
-
- mpn_add_n (xp1, xp1, tp, n + 1);
-#endif
-
- ASSERT (xp1[n] <= 3);
- ASSERT (xm1[n] <= 1);
-
- return neg;
-}
diff --git a/gmp/mpn/generic/toom_eval_dgr3_pm2.c b/gmp/mpn/generic/toom_eval_dgr3_pm2.c
deleted file mode 100644
index 3ba6d15f3d..0000000000
--- a/gmp/mpn/generic/toom_eval_dgr3_pm2.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/* mpn_toom_eval_dgr3_pm2 -- Evaluate a degree 3 polynomial in +2 and -2
-
- Contributed to the GNU project by Niels Möller
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Needs n+1 limbs of temporary storage. */
-int
-mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2,
- mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)
-{
- mp_limb_t cy;
- int neg;
-
- ASSERT (x3n > 0);
- ASSERT (x3n <= n);
-
- /* (x0 + 4 * x2) +/- (2 x1 + 8 x_3) */
-#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
-#if HAVE_NATIVE_mpn_addlsh2_n
- xp2[n] = mpn_addlsh2_n (xp2, xp, xp + 2*n, n);
-
- cy = mpn_addlsh2_n (tp, xp + n, xp + 3*n, x3n);
-#else /* HAVE_NATIVE_mpn_addlsh_n */
- xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2);
-
- cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2);
-#endif
- if (x3n < n)
- cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy);
- tp[n] = cy;
-#else
- cy = mpn_lshift (tp, xp + 2*n, n, 2);
- xp2[n] = cy + mpn_add_n (xp2, tp, xp, n);
-
- tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2);
- if (x3n < n)
- tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1);
- else
- tp[n] += mpn_add_n (tp, xp + n, tp, n);
-#endif
- mpn_lshift (tp, tp, n+1, 1);
-
- neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (neg)
- mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
- else
- mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
-#else
- if (neg)
- mpn_sub_n (xm2, tp, xp2, n + 1);
- else
- mpn_sub_n (xm2, xp2, tp, n + 1);
-
- mpn_add_n (xp2, xp2, tp, n + 1);
-#endif
-
- ASSERT (xp2[n] < 15);
- ASSERT (xm2[n] < 10);
-
- return neg;
-}
diff --git a/gmp/mpn/generic/toom_eval_pm1.c b/gmp/mpn/generic/toom_eval_pm1.c
deleted file mode 100644
index 2334b0aff4..0000000000
--- a/gmp/mpn/generic/toom_eval_pm1.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/* mpn_toom_eval_pm1 -- Evaluate a polynomial in +1 and -1
-
- Contributed to the GNU project by Niels Möller
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Evaluates a polynomial of degree k > 3, in the points +1 and -1. */
-int
-mpn_toom_eval_pm1 (mp_ptr xp1, mp_ptr xm1, unsigned k,
- mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp)
-{
- unsigned i;
- int neg;
-
- ASSERT (k >= 4);
-
- ASSERT (hn > 0);
- ASSERT (hn <= n);
-
- /* The degree k is also the number of full-size coefficients, so
- * that last coefficient, of size hn, starts at xp + k*n. */
-
- xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n);
- for (i = 4; i < k; i += 2)
- ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+i*n, n));
-
- tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n);
- for (i = 5; i < k; i += 2)
- ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+i*n, n));
-
- if (k & 1)
- ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+k*n, hn));
- else
- ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+k*n, hn));
-
- neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? ~0 : 0;
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (neg)
- mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1);
- else
- mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1);
-#else
- if (neg)
- mpn_sub_n (xm1, tp, xp1, n + 1);
- else
- mpn_sub_n (xm1, xp1, tp, n + 1);
-
- mpn_add_n (xp1, xp1, tp, n + 1);
-#endif
-
- ASSERT (xp1[n] <= k);
- ASSERT (xm1[n] <= k/2 + 1);
-
- return neg;
-}
diff --git a/gmp/mpn/generic/toom_eval_pm2.c b/gmp/mpn/generic/toom_eval_pm2.c
deleted file mode 100644
index 67afcc638e..0000000000
--- a/gmp/mpn/generic/toom_eval_pm2.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/* mpn_toom_eval_pm2 -- Evaluate a polynomial in +2 and -2
-
- Contributed to the GNU project by Niels Möller and Marco Bodrato
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* DO_addlsh2(d,a,b,n,cy) computes cy,{d,n} <- {a,n} + 4*(cy,{b,n}), it
- can be used as DO_addlsh2(d,a,d,n,d[n]), for accumulation on {d,n+1}. */
-#if HAVE_NATIVE_mpn_addlsh2_n
-#define DO_addlsh2(d, a, b, n, cy) \
-do { \
- (cy) <<= 2; \
- (cy) += mpn_addlsh2_n(d, a, b, n); \
-} while (0)
-#else
-#if HAVE_NATIVE_mpn_addlsh_n
-#define DO_addlsh2(d, a, b, n, cy) \
-do { \
- (cy) <<= 2; \
- (cy) += mpn_addlsh_n(d, a, b, n, 2); \
-} while (0)
-#else
-/* The following is not a general substitute for addlsh2.
- It is correct if d == b, but it is not if d == a. */
-#define DO_addlsh2(d, a, b, n, cy) \
-do { \
- (cy) <<= 2; \
- (cy) += mpn_lshift(d, b, n, 2); \
- (cy) += mpn_add_n(d, d, a, n); \
-} while (0)
-#endif
-#endif
-
-/* Evaluates a polynomial of degree 2 < k < GMP_NUMB_BITS, in the
- points +2 and -2. */
-int
-mpn_toom_eval_pm2 (mp_ptr xp2, mp_ptr xm2, unsigned k,
- mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp)
-{
- int i;
- int neg;
- mp_limb_t cy;
-
- ASSERT (k >= 3);
- ASSERT (k < GMP_NUMB_BITS);
-
- ASSERT (hn > 0);
- ASSERT (hn <= n);
-
- /* The degree k is also the number of full-size coefficients, so
- * that last coefficient, of size hn, starts at xp + k*n. */
-
- cy = 0;
- DO_addlsh2 (xp2, xp + (k-2) * n, xp + k * n, hn, cy);
- if (hn != n)
- cy = mpn_add_1 (xp2 + hn, xp + (k-2) * n + hn, n - hn, cy);
- for (i = k - 4; i >= 0; i -= 2)
- DO_addlsh2 (xp2, xp + i * n, xp2, n, cy);
- xp2[n] = cy;
-
- k--;
-
- cy = 0;
- DO_addlsh2 (tp, xp + (k-2) * n, xp + k * n, n, cy);
- for (i = k - 4; i >= 0; i -= 2)
- DO_addlsh2 (tp, xp + i * n, tp, n, cy);
- tp[n] = cy;
-
- if (k & 1)
- ASSERT_NOCARRY(mpn_lshift (tp , tp , n + 1, 1));
- else
- ASSERT_NOCARRY(mpn_lshift (xp2, xp2, n + 1, 1));
-
- neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (neg)
- mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
- else
- mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
-#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
- if (neg)
- mpn_sub_n (xm2, tp, xp2, n + 1);
- else
- mpn_sub_n (xm2, xp2, tp, n + 1);
-
- mpn_add_n (xp2, xp2, tp, n + 1);
-#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
-
- ASSERT (xp2[n] < (1<<(k+2))-1);
- ASSERT (xm2[n] < ((1<<(k+3))-1 - (1^k&1))/3);
-
- neg ^= ((k & 1) - 1);
-
- return neg;
-}
-
-#undef DO_addlsh2
diff --git a/gmp/mpn/generic/toom_eval_pm2exp.c b/gmp/mpn/generic/toom_eval_pm2exp.c
deleted file mode 100644
index b178fcac24..0000000000
--- a/gmp/mpn/generic/toom_eval_pm2exp.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/* mpn_toom_eval_pm2exp -- Evaluate a polynomial in +2^k and -2^k
-
- Contributed to the GNU project by Niels Möller
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */
-int
-mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k,
- mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift,
- mp_ptr tp)
-{
- unsigned i;
- int neg;
-#if HAVE_NATIVE_mpn_addlsh_n
- mp_limb_t cy;
-#endif
-
- ASSERT (k >= 3);
- ASSERT (shift*k < GMP_NUMB_BITS);
-
- ASSERT (hn > 0);
- ASSERT (hn <= n);
-
- /* The degree k is also the number of full-size coefficients, so
- * that last coefficient, of size hn, starts at xp + k*n. */
-
-#if HAVE_NATIVE_mpn_addlsh_n
- xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift);
- for (i = 4; i < k; i += 2)
- xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift);
-
- tp[n] = mpn_lshift (tp, xp+n, n, shift);
- for (i = 3; i < k; i+= 2)
- tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift);
-
- if (k & 1)
- {
- cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift);
- MPN_INCR_U (tp + hn, n+1 - hn, cy);
- }
- else
- {
- cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift);
- MPN_INCR_U (xp2 + hn, n+1 - hn, cy);
- }
-
-#else /* !HAVE_NATIVE_mpn_addlsh_n */
- xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift);
- xp2[n] += mpn_add_n (xp2, xp, tp, n);
- for (i = 4; i < k; i += 2)
- {
- xp2[n] += mpn_lshift (tp, xp + i*n, n, i*shift);
- xp2[n] += mpn_add_n (xp2, xp2, tp, n);
- }
-
- tp[n] = mpn_lshift (tp, xp+n, n, shift);
- for (i = 3; i < k; i+= 2)
- {
- tp[n] += mpn_lshift (xm2, xp + i*n, n, i*shift);
- tp[n] += mpn_add_n (tp, tp, xm2, n);
- }
-
- xm2[hn] = mpn_lshift (xm2, xp + k*n, hn, k*shift);
- if (k & 1)
- mpn_add (tp, tp, n+1, xm2, hn+1);
- else
- mpn_add (xp2, xp2, n+1, xm2, hn+1);
-#endif /* !HAVE_NATIVE_mpn_addlsh_n */
-
- neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (neg)
- mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
- else
- mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
-#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
- if (neg)
- mpn_sub_n (xm2, tp, xp2, n + 1);
- else
- mpn_sub_n (xm2, xp2, tp, n + 1);
-
- mpn_add_n (xp2, xp2, tp, n + 1);
-#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
-
- /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */
- ASSERT ((k+1)*shift >= GMP_LIMB_BITS ||
- xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1));
- ASSERT ((k+2)*shift >= GMP_LIMB_BITS ||
- xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1));
-
- return neg;
-}
diff --git a/gmp/mpn/generic/toom_eval_pm2rexp.c b/gmp/mpn/generic/toom_eval_pm2rexp.c
deleted file mode 100644
index 3cac46bd90..0000000000
--- a/gmp/mpn/generic/toom_eval_pm2rexp.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/* mpn_toom_eval_pm2rexp -- Evaluate a polynomial in +2^-k and -2^-k
-
- Contributed to the GNU project by Marco Bodrato
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#if HAVE_NATIVE_mpn_addlsh_n
-#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
-#else
-static mp_limb_t
-DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
-{
-#if USE_MUL_1 && 0
- return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
-#else
- mp_limb_t __cy;
- __cy = mpn_lshift(ws,src,n,s);
- return __cy + mpn_add_n(dst,dst,ws,n);
-#endif
-}
-#endif
-
-/* Evaluates a polynomial of degree k >= 3. */
-int
-mpn_toom_eval_pm2rexp (mp_ptr rp, mp_ptr rm,
- unsigned int q, mp_srcptr ap, mp_size_t n, mp_size_t t,
- unsigned int s, mp_ptr ws)
-{
- unsigned int i;
- int neg;
- /* {ap,q*n+t} -> {rp,n+1} {rm,n+1} , with {ws, n+1}*/
- ASSERT (n >= t);
- ASSERT (s != 0); /* or _eval_pm1 should be used */
- ASSERT (q > 1);
- ASSERT (s*q < GMP_NUMB_BITS);
- rp[n] = mpn_lshift(rp, ap, n, s*q);
- ws[n] = mpn_lshift(ws, ap+n, n, s*(q-1));
- if( (q & 1) != 0) {
- ASSERT_NOCARRY(mpn_add(ws,ws,n+1,ap+n*q,t));
- rp[n] += DO_mpn_addlsh_n(rp, ap+n*(q-1), n, s, rm);
- } else {
- ASSERT_NOCARRY(mpn_add(rp,rp,n+1,ap+n*q,t));
- }
- for(i=2; i<q-1; i++)
- {
- rp[n] += DO_mpn_addlsh_n(rp, ap+n*i, n, s*(q-i), rm);
- i++;
- ws[n] += DO_mpn_addlsh_n(ws, ap+n*i, n, s*(q-i), rm);
- };
-
- neg = (mpn_cmp (rp, ws, n + 1) < 0) ? ~0 : 0;
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- if (neg)
- mpn_add_n_sub_n (rp, rm, ws, rp, n + 1);
- else
- mpn_add_n_sub_n (rp, rm, rp, ws, n + 1);
-#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
- if (neg)
- mpn_sub_n (rm, ws, rp, n + 1);
- else
- mpn_sub_n (rm, rp, ws, n + 1);
-
- ASSERT_NOCARRY (mpn_add_n (rp, rp, ws, n + 1));
-#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
-
- return neg;
-}
diff --git a/gmp/mpn/generic/toom_interpolate_12pts.c b/gmp/mpn/generic/toom_interpolate_12pts.c
deleted file mode 100644
index 180b0329a3..0000000000
--- a/gmp/mpn/generic/toom_interpolate_12pts.c
+++ /dev/null
@@ -1,361 +0,0 @@
-/* Interpolation for the algorithm Toom-Cook 6.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-
-#if HAVE_NATIVE_mpn_sublsh_n
-#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
-#else
-static mp_limb_t
-DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
-{
-#if USE_MUL_1 && 0
- return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
-#else
- mp_limb_t __cy;
- __cy = mpn_lshift(ws,src,n,s);
- return __cy + mpn_sub_n(dst,dst,ws,n);
-#endif
-}
-#endif
-
-#if HAVE_NATIVE_mpn_addlsh_n
-#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
-#else
-static mp_limb_t
-DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
-{
-#if USE_MUL_1 && 0
- return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
-#else
- mp_limb_t __cy;
- __cy = mpn_lshift(ws,src,n,s);
- return __cy + mpn_add_n(dst,dst,ws,n);
-#endif
-}
-#endif
-
-#if HAVE_NATIVE_mpn_subrsh
-#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
-#else
-/* FIXME: This is not a correct definition, it assumes no carry */
-#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
-do { \
- mp_limb_t __cy; \
- MPN_DECR_U (dst, nd, src[0] >> s); \
- __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
- MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
-} while (0)
-#endif
-
-
-#if GMP_NUMB_BITS < 21
-#error Not implemented: Both sublsh_n(,,,20) should be corrected.
-#endif
-
-#if GMP_NUMB_BITS < 16
-#error Not implemented: divexact_by42525 needs splitting.
-#endif
-
-#if GMP_NUMB_BITS < 12
-#error Not implemented: Hard to adapt...
-#endif
-
-/* FIXME: tuneup should decide the best variant */
-#ifndef AORSMUL_FASTER_AORS_AORSLSH
-#define AORSMUL_FASTER_AORS_AORSLSH 1
-#endif
-#ifndef AORSMUL_FASTER_AORS_2AORSLSH
-#define AORSMUL_FASTER_AORS_2AORSLSH 1
-#endif
-#ifndef AORSMUL_FASTER_2AORSLSH
-#define AORSMUL_FASTER_2AORSLSH 1
-#endif
-#ifndef AORSMUL_FASTER_3AORSLSH
-#define AORSMUL_FASTER_3AORSLSH 1
-#endif
-
-#define BINVERT_9 \
- ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
-
-#define BINVERT_255 \
- (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
-
- /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
-#if GMP_LIMB_BITS == 32
-#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
-#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
-#else
-#if GMP_LIMB_BITS == 64
-#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
-#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
-#endif
-#endif
-
-#ifndef mpn_divexact_by255
-#if GMP_NUMB_BITS % 8 == 0
-#define mpn_divexact_by255(dst,src,size) \
- (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
-#else
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
-#else
-#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
-#endif
-#endif
-#endif
-
-#ifndef mpn_divexact_by9x4
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
-#else
-#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
-#endif
-#endif
-
-#ifndef mpn_divexact_by42525
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
-#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
-#else
-#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
-#endif
-#endif
-
-#ifndef mpn_divexact_by2835x4
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
-#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
-#else
-#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
-#endif
-#endif
-
-/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
- points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
- we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
- degree 11 (or 10), given the 12 (rsp. 11) values:
-
- r0 = limit at infinity of f(x) / x^7,
- r1 = f(4),f(-4),
- r2 = f(2),f(-2),
- r3 = f(1),f(-1),
- r4 = f(1/4),f(-1/4),
- r5 = f(1/2),f(-1/2),
- r6 = f(0).
-
- All couples of the form f(n),f(-n) must be already mixed with
- toom_couple_handling(f(n),...,f(-n),...)
-
- The result is stored in {pp, spt + 7*n (or 6*n)}.
- At entry, r6 is stored at {pp, 2n},
- r4 is stored at {pp + 3n, 3n + 1}.
- r2 is stored at {pp + 7n, 3n + 1}.
- r0 is stored at {pp +11n, spt}.
-
- The other values are 3n+1 limbs each (with most significant limbs small).
-
- Negative intermediate results are stored two-complemented.
- Inputs are destroyed.
-*/
-
-void
-mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
- mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
-{
- mp_limb_t cy;
- mp_size_t n3;
- mp_size_t n3p1;
- n3 = 3 * n;
- n3p1 = n3 + 1;
-
-#define r4 (pp + n3) /* 3n+1 */
-#define r2 (pp + 7 * n) /* 3n+1 */
-#define r0 (pp +11 * n) /* s+t <= 2*n */
-
- /******************************* interpolation *****************************/
- if (half != 0) {
- cy = mpn_sub_n (r3, r3, r0, spt);
- MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
-
- cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
- MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
- DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
-
- cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
- MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
- DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
- };
-
- r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
- DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
-#else
- ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
- mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
- MP_PTR_SWAP(r1, wsi);
-#endif
-
- r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
- DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
-#else
- mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
- ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
- MP_PTR_SWAP(r5, wsi);
-#endif
-
- r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
-
-#if AORSMUL_FASTER_AORS_AORSLSH
- mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
-#else
- mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
- DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
-#endif
- /* A division by 2835x4 follows. Warning: the operand can be negative! */
- mpn_divexact_by2835x4(r4, r4, n3p1);
- if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
- r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
-
-#if AORSMUL_FASTER_2AORSLSH
- mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
-#else
- DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
- DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
-#endif
- mpn_divexact_by255(r5, r5, n3p1);
-
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
-
-#if AORSMUL_FASTER_3AORSLSH
- ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
-#else
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
-#endif
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
- mpn_divexact_by42525(r1, r1, n3p1);
-
-#if AORSMUL_FASTER_AORS_2AORSLSH
- ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
-#else
- ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
- ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
-#endif
- mpn_divexact_by9x4(r2, r2, n3p1);
-
- ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
-
- mpn_sub_n (r4, r2, r4, n3p1);
- ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
- ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
-
- mpn_add_n (r5, r5, r1, n3p1);
- ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
-
- /* last interpolation steps... */
- ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
- ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
- /* ... could be mixed with recomposition
- ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1|
- */
-
- /***************************** recomposition *******************************/
- /*
- pp[] prior to operations:
- |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
-
- summation scheme for remaining operations:
- |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
- |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
- ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5|
- */
-
- cy = mpn_add_n (pp + n, pp + n, r5, n);
- cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
-#if HAVE_NATIVE_mpn_add_nc
- cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
-#else
- MPN_INCR_U (r5 + 2 * n, n + 1, cy);
- cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
-#endif
- MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
-
- pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
- cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
-#if HAVE_NATIVE_mpn_add_nc
- cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
-#else
- MPN_INCR_U (r3 + 2 * n, n + 1, cy);
- cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
-#endif
- MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
-
- pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
- if (half) {
- cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
-#if HAVE_NATIVE_mpn_add_nc
- if (LIKELY (spt > n)) {
- cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
- MPN_INCR_U (pp + 4 * n3, spt - n, cy);
- } else {
- ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
- }
-#else
- MPN_INCR_U (r1 + 2 * n, n + 1, cy);
- if (LIKELY (spt > n)) {
- cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
- MPN_INCR_U (pp + 4 * n3, spt - n, cy);
- } else {
- ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
- }
-#endif
- } else {
- ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
- }
-
-#undef r0
-#undef r2
-#undef r4
-}
diff --git a/gmp/mpn/generic/toom_interpolate_16pts.c b/gmp/mpn/generic/toom_interpolate_16pts.c
deleted file mode 100644
index 5afe6641f6..0000000000
--- a/gmp/mpn/generic/toom_interpolate_16pts.c
+++ /dev/null
@@ -1,527 +0,0 @@
-/* Interpolation for the algorithm Toom-Cook 8.5-way.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#if GMP_NUMB_BITS < 29
-#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
-#endif
-
-#if GMP_NUMB_BITS < 28
-#error Not implemented: divexact_by188513325 and _by182712915 will not work.
-#endif
-
-
-#if HAVE_NATIVE_mpn_sublsh_n
-#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
-#else
-static mp_limb_t
-DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
-{
-#if USE_MUL_1 && 0
- return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
-#else
- mp_limb_t __cy;
- __cy = mpn_lshift(ws,src,n,s);
- return __cy + mpn_sub_n(dst,dst,ws,n);
-#endif
-}
-#endif
-
-#if HAVE_NATIVE_mpn_addlsh_n
-#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
-#else
-static mp_limb_t
-DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
-{
-#if USE_MUL_1 && 0
- return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
-#else
- mp_limb_t __cy;
- __cy = mpn_lshift(ws,src,n,s);
- return __cy + mpn_add_n(dst,dst,ws,n);
-#endif
-}
-#endif
-
-#if HAVE_NATIVE_mpn_subrsh
-#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
-#else
-/* FIXME: This is not a correct definition, it assumes no carry */
-#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
-do { \
- mp_limb_t __cy; \
- MPN_DECR_U (dst, nd, src[0] >> s); \
- __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
- MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
-} while (0)
-#endif
-
-
-/* FIXME: tuneup should decide the best variant */
-#ifndef AORSMUL_FASTER_AORS_AORSLSH
-#define AORSMUL_FASTER_AORS_AORSLSH 1
-#endif
-#ifndef AORSMUL_FASTER_AORS_2AORSLSH
-#define AORSMUL_FASTER_AORS_2AORSLSH 1
-#endif
-#ifndef AORSMUL_FASTER_2AORSLSH
-#define AORSMUL_FASTER_2AORSLSH 1
-#endif
-#ifndef AORSMUL_FASTER_3AORSLSH
-#define AORSMUL_FASTER_3AORSLSH 1
-#endif
-
-#if GMP_NUMB_BITS < 43
-#define BIT_CORRECTION 1
-#define CORRECTION_BITS GMP_NUMB_BITS
-#else
-#define BIT_CORRECTION 0
-#define CORRECTION_BITS 0
-#endif
-
-#define BINVERT_9 \
- ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
-
-#define BINVERT_255 \
- (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
-
- /* FIXME: find some more general expressions for inverses */
-#if GMP_LIMB_BITS == 32
-#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
-#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
-#define BINVERT_182712915 (GMP_NUMB_MASK & CNST_LIMB(0x550659DB))
-#define BINVERT_188513325 (GMP_NUMB_MASK & CNST_LIMB(0xFBC333A5))
-#define BINVERT_255x182712915L (GMP_NUMB_MASK & CNST_LIMB(0x6FC4CB25))
-#define BINVERT_255x188513325L (GMP_NUMB_MASK & CNST_LIMB(0x6864275B))
-#if GMP_NAIL_BITS == 0
-#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
-#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
-#else /* GMP_NAIL_BITS != 0 */
-#define BINVERT_255x182712915H \
- (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
-#define BINVERT_255x188513325H \
- (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
-#endif
-#else
-#if GMP_LIMB_BITS == 64
-#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
-#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
-#define BINVERT_255x182712915 (GMP_NUMB_MASK & CNST_LIMB(0x1B649A076FC4CB25))
-#define BINVERT_255x188513325 (GMP_NUMB_MASK & CNST_LIMB(0x06DB993A6864275B))
-#endif
-#endif
-
-#ifndef mpn_divexact_by255
-#if GMP_NUMB_BITS % 8 == 0
-#define mpn_divexact_by255(dst,src,size) \
- (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
-#else
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
-#else
-#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
-#endif
-#endif
-#endif
-
-#ifndef mpn_divexact_by255x4
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
-#else
-#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
-#endif
-#endif
-
-#ifndef mpn_divexact_by9x16
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
-#else
-#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
-#endif
-#endif
-
-#ifndef mpn_divexact_by42525x16
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
-#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
-#else
-#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
-#endif
-#endif
-
-#ifndef mpn_divexact_by2835x64
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
-#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
-#else
-#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
-#endif
-#endif
-
-#ifndef mpn_divexact_by255x182712915
-#if GMP_NUMB_BITS < 36
-#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
-/* FIXME: use mpn_bdiv_q_2_pi2 */
-#endif
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
-#define mpn_divexact_by255x182712915(dst,src,size) \
- do { \
- mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0); \
- mpn_divexact_by255(dst,dst,size); \
- } while(0)
-#else
-#define mpn_divexact_by255x182712915(dst,src,size) \
- do { \
- mpn_divexact_1(dst,src,size,CNST_LIMB(182712915)); \
- mpn_divexact_by255(dst,dst,size); \
- } while(0)
-#endif
-#else /* GMP_NUMB_BITS > 35 */
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
-#define mpn_divexact_by255x182712915(dst,src,size) \
- mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
-#else
-#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
-#endif
-#endif /* GMP_NUMB_BITS >?< 36 */
-#endif
-
-#ifndef mpn_divexact_by255x188513325
-#if GMP_NUMB_BITS < 36
-#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
-/* FIXME: use mpn_bdiv_q_1_pi2 */
-#endif
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
-#define mpn_divexact_by255x188513325(dst,src,size) \
- do { \
- mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0); \
- mpn_divexact_by255(dst,dst,size); \
- } while(0)
-#else
-#define mpn_divexact_by255x188513325(dst,src,size) \
- do { \
- mpn_divexact_1(dst,src,size,CNST_LIMB(188513325)); \
- mpn_divexact_by255(dst,dst,size); \
- } while(0)
-#endif
-#else /* GMP_NUMB_BITS > 35 */
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
-#define mpn_divexact_by255x188513325(dst,src,size) \
- mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
-#else
-#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
-#endif
-#endif /* GMP_NUMB_BITS >?< 36 */
-#endif
-
-/* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
- points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
- +-1/8, 0. More precisely, we want to compute
- f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
- 14), given the 16 (rsp. 15) values:
-
- r0 = limit at infinity of f(x) / x^7,
- r1 = f(8),f(-8),
- r2 = f(4),f(-4),
- r3 = f(2),f(-2),
- r4 = f(1),f(-1),
- r5 = f(1/4),f(-1/4),
- r6 = f(1/2),f(-1/2),
- r7 = f(1/8),f(-1/8),
- r8 = f(0).
-
- All couples of the form f(n),f(-n) must be already mixed with
- toom_couple_handling(f(n),...,f(-n),...)
-
- The result is stored in {pp, spt + 7*n (or 8*n)}.
- At entry, r8 is stored at {pp, 2n},
- r6 is stored at {pp + 3n, 3n + 1}.
- r4 is stored at {pp + 7n, 3n + 1}.
- r2 is stored at {pp +11n, 3n + 1}.
- r0 is stored at {pp +15n, spt}.
-
- The other values are 3n+1 limbs each (with most significant limbs small).
-
- Negative intermediate results are stored two-complemented.
- Inputs are destroyed.
-*/
-
-void
-mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
- mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
-{
- mp_limb_t cy;
- mp_size_t n3;
- mp_size_t n3p1;
- n3 = 3 * n;
- n3p1 = n3 + 1;
-
-#define r6 (pp + n3) /* 3n+1 */
-#define r4 (pp + 7 * n) /* 3n+1 */
-#define r2 (pp +11 * n) /* 3n+1 */
-#define r0 (pp +15 * n) /* s+t <= 2*n */
-
- ASSERT( spt <= 2 * n );
- /******************************* interpolation *****************************/
- if( half != 0) {
- cy = mpn_sub_n (r4, r4, r0, spt);
- MPN_DECR_U (r4 + spt, n3p1 - spt, cy);
-
- cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
- MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
- DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);
-
- cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
- MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
- DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);
-
- cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
-#if BIT_CORRECTION
- cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
- n3p1 - spt - BIT_CORRECTION, cy);
- ASSERT (BIT_CORRECTION > 0 || cy == 0);
- /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
- cy = r7[n3p1];
- r7[n3p1] = 0x80;
-#else
- MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
-#endif
- DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
-#if BIT_CORRECTION
- /* FIXME: assumes r7[n3p1] is writable. */
- ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
- r7[n3p1] = cy;
-#endif
- };
-
- r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
- DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
-#else
- mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
- ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
- MP_PTR_SWAP(r5, wsi);
-#endif
-
- r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
- DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
-#else
- ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
- mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
- MP_PTR_SWAP(r3, wsi);
-#endif
-
- cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
-#if BIT_CORRECTION
- MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
- cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
- cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
- ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
-#else
- r7[n3] -= cy;
- DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
-#endif
-
-#if HAVE_NATIVE_mpn_add_n_sub_n
- mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
-#else
- mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
- mpn_add_n (r1, r1, r7, n3p1); /* if BIT_CORRECTION != 0, can give a carry. */
- MP_PTR_SWAP(r7, wsi);
-#endif
-
- r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);
-
-#if AORSMUL_FASTER_2AORSLSH
- mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
-#else
- DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
- DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
-#endif
-
- mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
-#if AORSMUL_FASTER_3AORSLSH
- mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
-#else
- DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
- DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
- DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
-#endif
- mpn_divexact_by255x188513325(r7, r7, n3p1);
-
- mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
- /* A division by 2835x64 follows. Warning: the operand can be negative! */
- mpn_divexact_by2835x64(r5, r5, n3p1);
- if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
- r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));
-
-#if AORSMUL_FASTER_AORS_AORSLSH
- mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
-#else
- mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
- DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
-#endif
-#if AORSMUL_FASTER_2AORSLSH
- mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
-#else
- DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
- DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
-#endif
- /* A division by 255x4 follows. Warning: the operand can be negative! */
- mpn_divexact_by255x4(r6, r6, n3p1);
- if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
- r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
-
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));
-
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
- ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));
-
- /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/
- DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
- mpn_submul_1 (r1, r2, n3p1, 1428);
- mpn_submul_1 (r1, r3, n3p1, 112896);
- mpn_divexact_by255x182712915(r1, r1, n3p1);
-
- ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
- mpn_divexact_by42525x16(r2, r2, n3p1);
-
-#if AORSMUL_FASTER_AORS_2AORSLSH
- ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
-#else
- ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
- ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
- ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
-#endif
- ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
- mpn_divexact_by9x16(r3, r3, n3p1);
-
- ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
- ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
- ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));
-
- mpn_add_n (r6, r2, r6, n3p1);
- ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
- ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));
-
- mpn_sub_n (r5, r3, r5, n3p1);
- ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
- ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));
-
- mpn_add_n (r7, r1, r7, n3p1);
- ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
- ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));
-
- /* last interpolation steps... */
- /* ... could be mixed with recomposition
- ||H-r7|M-r7|L-r7| ||H-r5|M-r5|L-r5|
- */
-
- /***************************** recomposition *******************************/
- /*
- pp[] prior to operations:
- |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
-
- summation scheme for remaining operations:
- |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
- |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
- ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| ||H r7|M r7|L r7|
- */
-
- cy = mpn_add_n (pp + n, pp + n, r7, n);
- cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
-#if HAVE_NATIVE_mpn_add_nc
- cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
-#else
- MPN_INCR_U (r7 + 2 * n, n + 1, cy);
- cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
-#endif
- MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);
-
- pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
- cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
-#if HAVE_NATIVE_mpn_add_nc
- cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
-#else
- MPN_INCR_U (r5 + 2 * n, n + 1, cy);
- cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
-#endif
- MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
-
- pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
- cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
-#if HAVE_NATIVE_mpn_add_nc
- cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
-#else
- MPN_INCR_U (r3 + 2 * n, n + 1, cy);
- cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
-#endif
- MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);
-
- pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
- if ( half ) {
- cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
-#if HAVE_NATIVE_mpn_add_nc
- if(LIKELY(spt > n)) {
- cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
- MPN_INCR_U (pp + 16 * n, spt - n, cy);
- } else {
- ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
- }
-#else
- MPN_INCR_U (r1 + 2 * n, n + 1, cy);
- if(LIKELY(spt > n)) {
- cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
- MPN_INCR_U (pp + 16 * n, spt - n, cy);
- } else {
- ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
- }
-#endif
- } else {
- ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
- }
-
-#undef r0
-#undef r2
-#undef r4
-#undef r6
-}
diff --git a/gmp/mpn/generic/toom_interpolate_5pts.c b/gmp/mpn/generic/toom_interpolate_5pts.c
index 9fa5f0b7a6..67260cc3d5 100644
--- a/gmp/mpn/generic/toom_interpolate_5pts.c
+++ b/gmp/mpn/generic/toom_interpolate_5pts.c
@@ -7,33 +7,23 @@
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2000-2003, 2005-2007, 2009 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003, 2005, 2006, 2007 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -41,29 +31,28 @@ see https://www.gnu.org/licenses/. */
void
mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
mp_size_t k, mp_size_t twor, int sa,
- mp_limb_t vinf0)
+ mp_limb_t vinf0, mp_ptr ws)
{
mp_limb_t cy, saved;
- mp_size_t twok;
- mp_size_t kk1;
- mp_ptr c1, v1, c3, vinf;
-
- twok = k + k;
- kk1 = twok + 1;
+ mp_size_t twok = k + k;
+ mp_size_t kk1 = twok + 1;
+ mp_ptr c1, v1, c3, vinf, c5;
+ mp_limb_t cout; /* final carry, should be zero at the end */
c1 = c + k;
v1 = c1 + k;
c3 = v1 + k;
vinf = c3 + k;
+ c5 = vinf + k;
#define v0 (c)
/* (1) v2 <- v2-vm1 < v2+|vm1|, (16 8 4 2 1) - (1 -1 1 -1 1) =
thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k) (15 9 3 3 0)
*/
- if (sa)
- ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
+ if (sa <= 0)
+ mpn_add_n (v2, v2, vm1, kk1);
else
- ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));
+ mpn_sub_n (v2, v2, vm1, kk1);
/* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
v0 v1 hi(vinf) |vm1| v2-vm1 EMPTY */
@@ -74,18 +63,17 @@ mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
/* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
v0 v1 hi(vinf) |vm1| (v2-vm1)/3 EMPTY */
- /* (2) vm1 <- tm1 := (v1 - vm1) / 2 [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
- tm1 >= 0 (0 1 0 1 0)
+ /* (2) vm1 <- tm1 := (v1 - sa*vm1) / 2 [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
+ tm1 >= 0 (0 1 0 1 0)
No carry comes out from {v1, kk1} +/- {vm1, kk1},
- and the division by two is exact.
- If (sa!=0) the sign of vm1 is negative */
- if (sa)
+ and the division by two is exact */
+ if (sa <= 0)
{
#ifdef HAVE_NATIVE_mpn_rsh1add_n
mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
- ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
- ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
+ mpn_add_n (vm1, v1, vm1, kk1);
+ mpn_rshift (vm1, vm1, kk1, 1);
#endif
}
else
@@ -93,8 +81,8 @@ mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
- ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
- ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
+ mpn_sub_n (vm1, v1, vm1, kk1);
+ mpn_rshift (vm1, vm1, kk1, 1);
#endif
}
@@ -115,8 +103,8 @@ mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
- ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
- ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
+ mpn_sub_n (v2, v2, v1, kk1);
+ mpn_rshift (v2, v2, kk1, 1);
#endif
/* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
@@ -125,75 +113,58 @@ mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
/* (5) v1 <- t1-tm1 (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
result is v1 >= 0
*/
- ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));
+ mpn_sub_n (v1, v1, vm1, kk1);
- /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
- cy = mpn_add_n (c1, c1, vm1, kk1);
- MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
- /* Memory allocated for vm1 is now free, it can be recycled ...*/
+ /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v0 v1-v0-tm1 hi(vinf) tm1 (v2-vm1-3t1)/6 EMPTY */
/* (6) v2 <- v2 - 2*vinf, (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
result is v2 >= 0 */
saved = vinf[0]; /* Remember v1's highest byte (will be overwritten). */
vinf[0] = vinf0; /* Set the right value for vinf0 */
-#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1
- cy = mpn_sublsh1_n_ip1 (v2, vinf, twor);
+#ifdef HAVE_NATIVE_mpn_sublsh1_n
+ cy = mpn_sublsh1_n (v2, v2, vinf, twor);
#else
- /* Overwrite unused vm1 */
- cy = mpn_lshift (vm1, vinf, twor, 1);
- cy += mpn_sub_n (v2, v2, vm1, twor);
+ cy = mpn_lshift (ws, vinf, twor, 1);
+ cy += mpn_sub_n (v2, v2, ws, twor);
#endif
MPN_DECR_U (v2 + twor, kk1 - twor, cy);
- /* Current matrix is
- [1 0 0 0 0; vinf
- 0 1 0 0 0; v2
- 1 0 1 0 0; v1
- 0 1 0 1 0; vm1
- 0 0 0 0 1] v0
- Some values already are in-place (we added vm1 in the correct position)
- | vinf| v1 | v0 |
- | vm1 |
- One still is in a separated area
- | +v2 |
- We have to compute v1-=vinf; vm1 -= v2,
- |-vinf|
- | -v2 |
- Carefully reordering operations we can avoid to compute twice the sum
- of the high half of v2 plus the low half of vinf.
- */
-
- /* Add the high half of t2 in {vinf} */
- if ( LIKELY(twor > k + 1) ) { /* This is the expected flow */
- cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
- MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
- } else { /* triggered only by very unbalanced cases like
- (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
- ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
- }
/* (7) v1 <- v1 - vinf, (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
result is >= 0 */
- /* Side effect: we also subtracted (high half) vm1 -= v2 */
cy = mpn_sub_n (v1, v1, vinf, twor); /* vinf is at most twor long. */
- vinf0 = vinf[0]; /* Save again the right value for vinf0 */
vinf[0] = saved;
MPN_DECR_U (v1 + twor, kk1 - twor, cy); /* Treat the last bytes. */
+ __GMPN_ADD_1 (cout, vinf, vinf, twor, vinf0); /* Add vinf0, propagate carry. */
- /* (8) vm1 <- vm1-v2 (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
- Operate only on the low half.
+ /* (8) vm1 <- vm1-t2 (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
+ vm1 >= 0
*/
- cy = mpn_sub_n (c1, c1, v2, k);
- MPN_DECR_U (v1, kk1, cy);
+ mpn_sub_n (vm1, vm1, v2, kk1); /* No overlapping here. */
/********************* Beginning the final phase **********************/
- /* Most of the recomposition was done */
+ /* {c,2k} {c+2k,2k } {c+4k ,2r } {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v0 t1 hi(t1)+vinf tm1 (v2-vm1-3t1)/6 EMPTY */
+
+ /* (9) add t2 in {c+3k, ...} */
+ cy = mpn_add_n (c3, c3, v2, kk1);
+ __GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
+
+ /* {c,2k} {c+2k,2k } {c+4k ,2r } {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+ v0 t1 hi(t1)+vinf tm1 (v2-vm1-3t1)/6 EMPTY */
+ /* c c+k c+2k c+3k c+4k t t+2k+1 t+4k+2
+ v0 t1 vinf tm1 t2
+ +t2 */
+
+ /* add vm1 in {c+k, ...} */
+ cy = mpn_add_n (c1, c1, vm1, kk1);
+ __GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
- /* add t2 in {c+3k, ...}, but only the low half */
- cy = mpn_add_n (c3, c3, v2, k);
- vinf[0] += cy;
- ASSERT(vinf[0] >= cy); /* No carry */
- MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */
+ /* c c+k c+2k c+3k c+4k t t+2k+1 t+4k+2
+ v0 t1 vinf tm1 t2
+ +tm1 +t2 */
#undef v0
+#undef t2
}
diff --git a/gmp/mpn/generic/toom_interpolate_6pts.c b/gmp/mpn/generic/toom_interpolate_6pts.c
deleted file mode 100644
index bdb2e95b89..0000000000
--- a/gmp/mpn/generic/toom_interpolate_6pts.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
-#ifndef mpn_divexact_by3
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
-#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
-#else
-#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
-#endif
-#endif
-
-/* Interpolation for Toom-3.5, using the evaluation points: infinity,
- 1, -1, 2, -2. More precisely, we want to compute
- f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
- six values
-
- w5 = f(0),
- w4 = f(-1),
- w3 = f(1)
- w2 = f(-2),
- w1 = f(2),
- w0 = limit at infinity of f(x) / x^5,
-
- The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
- {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
- {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
- significant limbs small). f(-1) and f(-2) may be negative, signs
- determined by the flag bits. All intermediate results are positive.
- Inputs are destroyed.
-
- Interpolation sequence was taken from the paper: "Integer and
- Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
- Some slight variations were introduced: adaptation to "gmp
- instruction set", and a final saving of an operation by interlacing
- interpolation and recomposition phases.
-*/
-
-void
-mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
- mp_ptr w4, mp_ptr w2, mp_ptr w1,
- mp_size_t w0n)
-{
- mp_limb_t cy;
- /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
- mp_limb_t cy4, cy6, embankment;
-
- ASSERT( n > 0 );
- ASSERT( 2*n >= w0n && w0n > 0 );
-
-#define w5 pp /* 2n */
-#define w3 (pp + 2 * n) /* 2n+1 */
-#define w0 (pp + 5 * n) /* w0n */
-
- /* Interpolate with sequence:
- W2 =(W1 - W2)>>2
- W1 =(W1 - W5)>>1
- W1 =(W1 - W2)>>1
- W4 =(W3 - W4)>>1
- W2 =(W2 - W4)/3
- W3 = W3 - W4 - W5
- W1 =(W1 - W3)/3
- // Last steps are mixed with recomposition...
- W2 = W2 - W0<<2
- W4 = W4 - W2
- W3 = W3 - W1
- W2 = W2 - W0
- */
-
- /* W2 =(W1 - W2)>>2 */
- if (flags & toom6_vm2_neg)
- mpn_add_n (w2, w1, w2, 2 * n + 1);
- else
- mpn_sub_n (w2, w1, w2, 2 * n + 1);
- mpn_rshift (w2, w2, 2 * n + 1, 2);
-
- /* W1 =(W1 - W5)>>1 */
- w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
- mpn_rshift (w1, w1, 2 * n + 1, 1);
-
- /* W1 =(W1 - W2)>>1 */
-#if HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
-#else
- mpn_sub_n (w1, w1, w2, 2 * n + 1);
- mpn_rshift (w1, w1, 2 * n + 1, 1);
-#endif
-
- /* W4 =(W3 - W4)>>1 */
- if (flags & toom6_vm1_neg)
- {
-#if HAVE_NATIVE_mpn_rsh1add_n
- mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
-#else
- mpn_add_n (w4, w3, w4, 2 * n + 1);
- mpn_rshift (w4, w4, 2 * n + 1, 1);
-#endif
- }
- else
- {
-#if HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
-#else
- mpn_sub_n (w4, w3, w4, 2 * n + 1);
- mpn_rshift (w4, w4, 2 * n + 1, 1);
-#endif
- }
-
- /* W2 =(W2 - W4)/3 */
- mpn_sub_n (w2, w2, w4, 2 * n + 1);
- mpn_divexact_by3 (w2, w2, 2 * n + 1);
-
- /* W3 = W3 - W4 - W5 */
- mpn_sub_n (w3, w3, w4, 2 * n + 1);
- w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
-
- /* W1 =(W1 - W3)/3 */
- mpn_sub_n (w1, w1, w3, 2 * n + 1);
- mpn_divexact_by3 (w1, w1, 2 * n + 1);
-
- /*
- [1 0 0 0 0 0;
- 0 1 0 0 0 0;
- 1 0 1 0 0 0;
- 0 1 0 1 0 0;
- 1 0 1 0 1 0;
- 0 0 0 0 0 1]
-
- pp[] prior to operations:
- |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
-
- summation scheme for remaining operations:
- |______________5|n_____4|n_____3|n_____2|n______|n______|pp
- |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
- || H w4 | L w4 |
- || H w2 | L w2 |
- || H w1 | L w1 |
- ||-H w1 |-L w1 |
- |-H w0 |-L w0 ||-H w2 |-L w2 |
- */
- cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
- MPN_INCR_U (pp + 3 * n + 1, n, cy);
-
- /* W2 -= W0<<2 */
-#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
-#if HAVE_NATIVE_mpn_sublsh2_n_ip1
- cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
-#else
- cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
-#endif
-#else
- /* {W4,2*n+1} is now free and can be overwritten. */
- cy = mpn_lshift(w4, w0, w0n, 2);
- cy+= mpn_sub_n(w2, w2, w4, w0n);
-#endif
- MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
-
- /* W4L = W4L - W2L */
- cy = mpn_sub_n (pp + n, pp + n, w2, n);
- MPN_DECR_U (w3, 2 * n + 1, cy);
-
- /* W3H = W3H + W2L */
- cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
- /* W1L + W2H */
- cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
- MPN_INCR_U (w1 + n, n + 1, cy);
-
- /* W0 = W0 + W1H */
- if (LIKELY (w0n > n))
- cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
- else
- cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
-
- /*
- summation scheme for the next operation:
- |...____5|n_____4|n_____3|n_____2|n______|n______|pp
- |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
- ...-w0___|-w1_w2 |
- */
- /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
- cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
-
- /* embankment is a "dirty trick" to avoid carry/borrow propagation
- beyond allocated memory */
- embankment = w0[w0n - 1] - 1;
- w0[w0n - 1] = 1;
- if (LIKELY (w0n > n)) {
- if (cy4 > cy6)
- MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
- else
- MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
- MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
- MPN_INCR_U (w0 + n, w0n - n, cy6);
- } else {
- MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
- MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
- }
- w0[w0n - 1] += embankment;
-
-#undef w5
-#undef w3
-#undef w0
-
-}
diff --git a/gmp/mpn/generic/toom_interpolate_7pts.c b/gmp/mpn/generic/toom_interpolate_7pts.c
index 2a67dba82f..872da26309 100644
--- a/gmp/mpn/generic/toom_interpolate_7pts.c
+++ b/gmp/mpn/generic/toom_interpolate_7pts.c
@@ -1,7 +1,6 @@
/* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
- Contributed to the GNU project by Niels Möller.
- Improvements by Marco Bodrato.
+ Contributed to the GNU project by Niels Möller.
THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
@@ -12,216 +11,149 @@ Copyright 2006, 2007, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
-#define BINVERT_3 MODLIMB_INVERSE_3
-
-#define BINVERT_9 \
- ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
-
-#define BINVERT_15 \
- ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
-
-/* For the various mpn_divexact_byN here, fall back to using either
- mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is
- many faster if it is native. For now, since mpn_divexact_1 is native on
- several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
- mpn_pi1_bdiv_q_1 unconditionally. FIXME. */
+/* Arithmetic (sign-preserving) right shift of {sp,n} by `shift' bits into {rp,n}, requiring that the shifted out bits are zero. */
+static inline void
+divexact_2exp (mp_ptr rp, mp_srcptr sp, mp_size_t n, unsigned shift)
+{
+ mp_limb_t sign; /* bits to OR into the top limb of the result */
+ sign = LIMB_HIGHBIT_TO_MASK (sp[n-1] << GMP_NAIL_BITS) << (GMP_NUMB_BITS - shift); /* taken from the top limb before shifting (callers pass rp == sp); presumably an all-ones mask when the top numb bit is set -- confirm against gmp-impl.h */
+ ASSERT_NOCARRY (mpn_rshift (rp, sp, n, shift)); /* presumably asserts that mpn_rshift's return value (the bits shifted out) is zero */
+ rp[n-1] |= sign & GMP_NUMB_MASK; /* refill the top `shift' bits vacated by the shift, masked to the numb part */
+}
/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
#ifndef mpn_divexact_by3
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
-#else
#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
#endif
-#endif
-
#ifndef mpn_divexact_by9
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
-#else
#define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
#endif
-#endif
-
#ifndef mpn_divexact_by15
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
-#else
#define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
#endif
-#endif
-/* Interpolation for toom4, using the evaluation points 0, infinity,
- 1, -1, 2, -2, 1/2. More precisely, we want to compute
+/* Interpolation for toom4, using the evaluation points 0, infinity,
+ 2, 1, -1, 1/2, -1/2. More precisely, we want to compute
f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
seven values
w0 = f(0),
- w1 = f(-2),
- w2 = f(1),
+ w1 = 64 f(-1/2),
+ w2 = 64 f(1/2),
w3 = f(-1),
- w4 = f(2)
- w5 = 64 * f(1/2)
+ w4 = f(1)
+ w5 = f(2)
w6 = limit at infinity of f(x) / x^6,
The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
w6n }. The other values are 2n + 1 limbs each (with most
significant limbs small). f(-1) and f(-1/2) may be negative, signs
- determined by the flag bits. Inputs are destroyed.
+ determined by the flag bits. All intermediate results are
+ represented in two's complement. Inputs are destroyed.
Needs (2*n + 1) limbs of temporary storage.
*/
void
-mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
+mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom4_flags flags,
mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
mp_size_t w6n, mp_ptr tp)
{
- mp_size_t m;
+ mp_size_t m = 2*n + 1;
+ mp_ptr w2 = rp + 2*n;
+ mp_ptr w6 = rp + 6*n;
mp_limb_t cy;
- m = 2*n + 1;
-#define w0 rp
-#define w2 (rp + 2*n)
-#define w6 (rp + 6*n)
-
ASSERT (w6n > 0);
ASSERT (w6n <= 2*n);
- /* Using formulas similar to Marco Bodrato's
+ /* Using Marco Bodrato's formulas
- W5 = W5 + W4
- W1 =(W4 - W1)/2
- W4 = W4 - W0
- W4 =(W4 - W1)/4 - W6*16
- W3 =(W2 - W3)/2
- W2 = W2 - W3
+ W5 = W5 + W2
+ W3 =(W3 + W4)/2
+ W1 = W1 + W2
+ W2 = W2 - W6 - W0*64
+ W2 =(W2*2 - W1)/8
+ W4 = W4 - W3
- W5 = W5 - W2*65 May be negative.
- W2 = W2 - W6 - W0
- W5 =(W5 + W2*45)/2 Now >= 0 again.
- W4 =(W4 - W2)/3
- W2 = W2 - W4
+ W5 = W5 - W4*65
+ W4 = W4 - W6 - W0
+ W5 = W5 + W4*45
+ W2 =(W2 - W4)/3
+ W4 = W4 - W2
- W1 = W5 - W1 May be negative.
- W5 =(W5 - W3*8)/9
+ W1 = W1 - W5
+ W5 =(W5 - W3*16)/ 18
W3 = W3 - W5
- W1 =(W1/15 + W5)/2 Now >= 0 again.
+ W1 =(W1/30 + W5)/ 2
W5 = W5 - W1
- where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
- W4 = f(2), W5 = f(1/2), W6 = f(oo),
-
- Note that most intermediate results are positive; the ones that
- may be negative are represented in two's complement. We must
- never shift right a value that may be negative, since that would
- invalidate the sign bit. On the other hand, divexact by odd
- numbers work fine with two's complement.
+ where W0 = f(0), W1 = 64 f(-1/2), W2 = 64 f(1/2), W3 = f(-1),
+ W4 = f(1), W5 = f(2), W6 = f(oo),
*/
- mpn_add_n (w5, w5, w4, m);
- if (flags & toom7_w1_neg)
- {
-#ifdef HAVE_NATIVE_mpn_rsh1add_n
- mpn_rsh1add_n (w1, w1, w4, m);
-#else
- mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1));
- mpn_rshift (w1, w1, m, 1);
-#endif
- }
+ mpn_add_n (w5, w5, w2, m);
+ if (flags & toom4_w3_neg)
+ mpn_add_n (w3, w3, w4, m);
else
- {
-#ifdef HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (w1, w4, w1, m);
-#else
- mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1));
- mpn_rshift (w1, w1, m, 1);
-#endif
- }
- mpn_sub (w4, w4, m, w0, 2*n);
- mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3));
- mpn_rshift (w4, w4, m, 2); /* w4>=0 */
-
- tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
- mpn_sub (w4, w4, m, tp, w6n+1);
-
- if (flags & toom7_w3_neg)
- {
-#ifdef HAVE_NATIVE_mpn_rsh1add_n
- mpn_rsh1add_n (w3, w3, w2, m);
-#else
- mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1));
- mpn_rshift (w3, w3, m, 1);
-#endif
- }
+ mpn_sub_n (w3, w4, w3, m);
+ divexact_2exp (w3, w3, m, 1);
+ if (flags & toom4_w1_neg)
+ mpn_add_n (w1, w1, w2, m);
else
- {
-#ifdef HAVE_NATIVE_mpn_rsh1sub_n
- mpn_rsh1sub_n (w3, w2, w3, m);
-#else
- mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1));
- mpn_rshift (w3, w3, m, 1);
-#endif
- }
-
- mpn_sub_n (w2, w2, w3, m);
-
- mpn_submul_1 (w5, w2, m, 65);
+ mpn_sub_n (w1, w2, w1, m);
mpn_sub (w2, w2, m, w6, w6n);
- mpn_sub (w2, w2, m, w0, 2*n);
-
- mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1));
- mpn_rshift (w5, w5, m, 1);
- mpn_sub_n (w4, w4, w2, m);
-
- mpn_divexact_by3 (w4, w4, m);
+ tp[2*n] = mpn_lshift (tp, rp, 2*n, 6);
+ mpn_sub_n (w2, w2, tp, m);
+ mpn_lshift (w2, w2, m, 1);
+ mpn_sub_n (w2, w2, w1, m);
+ divexact_2exp (w2, w2, m, 3);
+ mpn_sub_n (w4, w4, w3, m);
+
+ mpn_submul_1 (w5, w4, m, 65);
+ mpn_sub (w4, w4, m, w6, w6n);
+ mpn_sub (w4, w4, m, rp, 2*n);
+ mpn_addmul_1 (w5, w4, m, 45);
mpn_sub_n (w2, w2, w4, m);
+ /* Rely on divexact working with two's complement */
+ mpn_divexact_by3 (w2, w2, m);
+ mpn_sub_n (w4, w4, w2, m);
- mpn_sub_n (w1, w5, w1, m);
- mpn_lshift (tp, w3, m, 3);
+ mpn_sub_n (w1, w1, w5, m);
+ mpn_lshift (tp, w3, m, 4);
mpn_sub_n (w5, w5, tp, m);
+ divexact_2exp (w5, w5, m, 1);
mpn_divexact_by9 (w5, w5, m);
mpn_sub_n (w3, w3, w5, m);
-
+ divexact_2exp (w1, w1, m, 1);
mpn_divexact_by15 (w1, w1, m);
- mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1));
- mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
+ mpn_add_n (w1, w1, w5, m);
+ divexact_2exp (w1, w1, m, 1);
mpn_sub_n (w5, w5, w1, m);
- /* These bounds are valid for the 4x4 polynomial product of toom44,
- * and they are conservative for toom53 and toom62. */
- ASSERT (w1[2*n] < 2);
- ASSERT (w2[2*n] < 3);
- ASSERT (w3[2*n] < 4);
- ASSERT (w4[2*n] < 3);
- ASSERT (w5[2*n] < 2);
+ /* Two's complement coefficients must be non-negative at the end of
+ this procedure. */
+ ASSERT ( !(w1[2*n] & GMP_LIMB_HIGHBIT));
+ ASSERT ( !(w2[2*n] & GMP_LIMB_HIGHBIT));
+ ASSERT ( !(w3[2*n] & GMP_LIMB_HIGHBIT));
+ ASSERT ( !(w4[2*n] & GMP_LIMB_HIGHBIT));
+ ASSERT ( !(w5[2*n] & GMP_LIMB_HIGHBIT));
/* Addition chain. Note carries and the 2n'th limbs that need to be
* added in.
@@ -242,8 +174,8 @@ mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
* c7 c6 c5 c4 c3 Carries to propagate
*/
- cy = mpn_add_n (rp + n, rp + n, w1, m);
- MPN_INCR_U (w2 + n + 1, n , cy);
+ cy = mpn_add_n (rp + n, rp + n, w1, 2*n);
+ MPN_INCR_U (w2 + n, n + 1, w1[2*n] + cy);
cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
@@ -251,7 +183,10 @@ mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
if (w6n > n + 1)
- ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1));
+ {
+ mp_limb_t c7 = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);
+ MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, c7);
+ }
else
{
ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
diff --git a/gmp/mpn/generic/toom_interpolate_8pts.c b/gmp/mpn/generic/toom_interpolate_8pts.c
deleted file mode 100644
index 9e8808334e..0000000000
--- a/gmp/mpn/generic/toom_interpolate_8pts.c
+++ /dev/null
@@ -1,212 +0,0 @@
-/* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72.
-
- Contributed to the GNU project by Marco Bodrato.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2011, 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-#define BINVERT_3 MODLIMB_INVERSE_3
-
-#define BINVERT_15 \
- ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
-
-#define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK)
-
-#ifndef mpn_divexact_by3
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
-#else
-#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
-#endif
-#endif
-
-#ifndef mpn_divexact_by45
-#if GMP_NUMB_BITS % 12 == 0
-#define mpn_divexact_by45(dst,src,size) \
- (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45)))
-#else
-#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
-#define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0)
-#else
-#define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45)
-#endif
-#endif
-#endif
-
-#if HAVE_NATIVE_mpn_sublsh2_n_ip1
-#define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n)
-#else
-#define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws)
-#endif
-
-#if HAVE_NATIVE_mpn_sublsh_n
-#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,dst,src,n,s)
-#else
-static mp_limb_t
-DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
-{
-#if USE_MUL_1 && 0
- return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
-#else
- mp_limb_t __cy;
- __cy = mpn_lshift (ws,src,n,s);
- return __cy + mpn_sub_n (dst,dst,ws,n);
-#endif
-}
-#endif
-
-
-#if HAVE_NATIVE_mpn_subrsh
-#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s)
-#else
-/* This is not a correct definition, it assumes no carry */
-#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
-do { \
- mp_limb_t __cy; \
- MPN_DECR_U (dst, nd, src[0] >> s); \
- __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
- MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
-} while (0)
-#endif
-
-/* Interpolation for Toom-4.5 (or Toom-4), using the evaluation
- points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely,
- we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
- degree 7 (or 6), given the 8 (rsp. 7) values:
-
- r1 = limit at infinity of f(x) / x^7,
- r2 = f(4),
- r3 = f(-4),
- r4 = f(2),
- r5 = f(-2),
- r6 = f(1),
- r7 = f(-1),
- r8 = f(0).
-
- All couples of the form f(n),f(-n) must be already mixed with
- toom_couple_handling(f(n),...,f(-n),...)
-
- The result is stored in {pp, spt + 7*n (or 6*n)}.
- At entry, r8 is stored at {pp, 2n},
- r5 is stored at {pp + 3n, 3n + 1}.
-
- The other values are 2n+... limbs each (with most significant limbs small).
-
- All intermediate results are positive.
- Inputs are destroyed.
-*/
-
-void
-mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
- mp_ptr r3, mp_ptr r7,
- mp_size_t spt, mp_ptr ws)
-{
- mp_limb_signed_t cy;
- mp_ptr r5, r1;
- r5 = (pp + 3 * n); /* 3n+1 */
- r1 = (pp + 7 * n); /* spt */
-
- /******************************* interpolation *****************************/
-
- DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
- cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
- MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);
-
- DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
- cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
- MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);
-
- r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
- cy = mpn_sub_n (r7, r7, r1, spt);
- MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);
-
- ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
- ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));
-
- ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));
-
- ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
-
- mpn_divexact_by45 (r3, r3, 3 * n + 1);
-
- ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));
-
- ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));
-
- /* last interpolation steps... */
- /* ... are mixed with recomposition */
-
- /***************************** recomposition *******************************/
- /*
- pp[] prior to operations:
- |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp
-
- summation scheme for remaining operations:
- |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
- |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
- ||_H r3|_M r3|_L*r3|
- ||_H_r7|_M_r7|_L_r7|
- ||-H r3|-M r3|-L*r3|
- ||-H*r5|-M_r5|-L_r5|
- */
-
- cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
- cy-= mpn_sub_n (pp + n, pp + n, r5, n);
- if (0 > cy)
- MPN_DECR_U (r7 + n, 2*n + 1, 1);
- else
- MPN_INCR_U (r7 + n, 2*n + 1, cy);
-
- cy = mpn_sub_n (pp + 2*n, r7 + n, r5 + n, n); /* Mr7-Mr5 */
- MPN_DECR_U (r7 + 2*n, n + 1, cy);
-
- cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
- r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
- cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
- if (UNLIKELY(0 > cy))
- MPN_DECR_U (r5 + n + 1, 2*n, 1);
- else
- MPN_INCR_U (r5 + n + 1, 2*n, cy);
-
- ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */
-
- cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
- MPN_INCR_U (r3 + 2*n, n + 1, cy);
- cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
- if (LIKELY(spt != n))
- MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
- else
- ASSERT (r3[3*n] | cy == 0);
-}
diff --git a/gmp/mpn/generic/trialdiv.c b/gmp/mpn/generic/trialdiv.c
deleted file mode 100644
index cad159c3a0..0000000000
--- a/gmp/mpn/generic/trialdiv.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/* mpn_trialdiv -- find small factors of an mpn number using trial division.
-
- Contributed to the GNU project by Torbjorn Granlund.
-
- THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
- SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/*
- This function finds the first (smallest) factor represented in
- trialdivtab.h. It does not stop the factoring effort just because it has
- reached some sensible limit, such as the square root of the input number.
-
- The caller can limit the factoring effort by passing NPRIMES. The function
- will then divide until that limit, or perhaps a few primes more. A position
- which only mpn_trialdiv can make sense of is returned in the WHERE
- parameter. It can be used for restarting the factoring effort; the first
- call should pass 0 here.
-
- Input: 1. A non-negative number T = {tp,tn}
- 2. NPRIMES as described above,
- 3. *WHERE as described above.
- Output: 1. *WHERE updated as described above.
- 2. Return value is non-zero if we found a factor, else zero
- To get the actual prime factor, compute the mod B inverse
- of the return value.
-*/
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-struct gmp_primes_dtab {
- mp_limb_t binv;
- mp_limb_t lim;
-};
-
-struct gmp_primes_ptab {
- mp_limb_t ppp; /* primes, multiplied together */
- mp_limb_t cps[7]; /* ppp values pre-computed for mpn_mod_1s_4p */
- unsigned int idx:24; /* index of first primes in dtab */
- unsigned int np :8; /* number of primes related to this entry */
-};
-
-
-static const struct gmp_primes_dtab gmp_primes_dtab[] =
-{
-#define WANT_dtab
-#define P(p,inv,lim) {inv,lim}
-#include "trialdivtab.h"
-#undef WANT_dtab
-#undef P
- {0,0}
-};
-
-static const struct gmp_primes_ptab gmp_primes_ptab[] =
-{
-#define WANT_ptab
-#include "trialdivtab.h"
-#undef WANT_ptab
-};
-
-#define PTAB_LINES (sizeof (gmp_primes_ptab) / sizeof (gmp_primes_ptab[0]))
-
-/* FIXME: We could optimize out one of the outer loop conditions if we
- had a final ptab entry with a huge nd field. */
-mp_limb_t
-mpn_trialdiv (mp_srcptr tp, mp_size_t tn, mp_size_t nprimes, int *where)
-{
- mp_limb_t ppp;
- const mp_limb_t *cps;
- const struct gmp_primes_dtab *dp;
- long i, j, idx, np;
- mp_limb_t r, q;
-
- ASSERT (tn >= 1);
-
- for (i = *where; i < PTAB_LINES; i++)
- {
- ppp = gmp_primes_ptab[i].ppp;
- cps = gmp_primes_ptab[i].cps;
-
- r = mpn_mod_1s_4p (tp, tn, ppp << cps[1], cps);
-
- idx = gmp_primes_ptab[i].idx;
- np = gmp_primes_ptab[i].np;
-
- /* Check divisibility by individual primes. */
- dp = &gmp_primes_dtab[idx] + np;
- for (j = -np; j < 0; j++)
- {
- q = r * dp[j].binv;
- if (q <= dp[j].lim)
- {
- *where = i;
- return dp[j].binv;
- }
- }
-
- nprimes -= np;
- if (nprimes <= 0)
- return 0;
- }
- return 0;
-}
diff --git a/gmp/mpn/generic/udiv_w_sdiv.c b/gmp/mpn/generic/udiv_w_sdiv.c
index 7136429f0f..f876aa5734 100644
--- a/gmp/mpn/generic/udiv_w_sdiv.c
+++ b/gmp/mpn/generic/udiv_w_sdiv.c
@@ -9,40 +9,30 @@
GNU MP RELEASE.
-Copyright 1992, 1994, 1996, 2000, 2011, 2012 Free Software Foundation, Inc.
+Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
mp_limb_t
-mpn_udiv_w_sdiv (mp_limb_t *rp, mp_limb_t a1, mp_limb_t a0, mp_limb_t d)
+mpn_udiv_w_sdiv (rp, a1, a0, d)
+ mp_limb_t *rp, a1, a0, d;
{
mp_limb_t q, r;
mp_limb_t c0, c1, b1;
@@ -52,7 +42,7 @@ mpn_udiv_w_sdiv (mp_limb_t *rp, mp_limb_t a1, mp_limb_t a0, mp_limb_t d)
if ((mp_limb_signed_t) d >= 0)
{
- if (a1 < d - a1 - (a0 >> (GMP_LIMB_BITS - 1)))
+ if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1)))
{
/* dividend, divisor, and quotient are nonnegative */
sdiv_qrnnd (q, r, a1, a0, d);
@@ -60,18 +50,18 @@ mpn_udiv_w_sdiv (mp_limb_t *rp, mp_limb_t a1, mp_limb_t a0, mp_limb_t d)
else
{
/* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */
- sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (GMP_LIMB_BITS - 1));
+ sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1));
/* Divide (c1*2^32 + c0) by d */
sdiv_qrnnd (q, r, c1, c0, d);
/* Add 2^31 to quotient */
- q += (mp_limb_t) 1 << (GMP_LIMB_BITS - 1);
+ q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
}
}
else
{
b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */
c1 = a1 >> 1; /* A/2 */
- c0 = (a1 << (GMP_LIMB_BITS - 1)) + (a0 >> 1);
+ c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1);
if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */
{
@@ -126,12 +116,12 @@ mpn_udiv_w_sdiv (mp_limb_t *rp, mp_limb_t a1, mp_limb_t a0, mp_limb_t d)
{ /* Hence a1 = d - 1 = 2*b1 - 1 */
if (a0 >= -d)
{
- q = -CNST_LIMB(1);
+ q = -1;
r = a0 + d;
}
else
{
- q = -CNST_LIMB(2);
+ q = -2;
r = a0 + 2*d;
}
}
diff --git a/gmp/mpn/generic/zero.c b/gmp/mpn/generic/zero.c
deleted file mode 100644
index e6e7fd3101..0000000000
--- a/gmp/mpn/generic/zero.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* mpn_zero
-
-Copyright 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-void
-mpn_zero (mp_ptr rp, mp_size_t n)
-{
- mp_size_t i;
-
- rp += n;
- for (i = -n; i != 0; i++)
- rp[i] = 0;
-}
diff --git a/gmp/mpn/i960/README b/gmp/mpn/i960/README
new file mode 100644
index 0000000000..d68a0a83eb
--- /dev/null
+++ b/gmp/mpn/i960/README
@@ -0,0 +1,9 @@
+This directory contains mpn functions for Intel i960 processors.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The code in this directory is not well optimized.
+
+STATUS
+
+The code in this directory has not been tested.
diff --git a/gmp/mpn/i960/add_n.s b/gmp/mpn/i960/add_n.s
new file mode 100644
index 0000000000..24abc6b0c9
--- /dev/null
+++ b/gmp/mpn/i960/add_n.s
@@ -0,0 +1,41 @@
+# I960 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 4
+ .globl ___gmpn_add_n
+___gmpn_add_n: # registers as used below: g0 = res_ptr, g1 = s1_ptr, g2 = s2_ptr, g3 = limb count
+ mov 0,g6 # clear carry-save register
+ cmpo 1,0 # clear cy
+# Each pass adds one limb pair; the carry is parked in g6 because the loop-closing compare overwrites the condition code.
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ addc g4,g5,g4 # main add
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, clears C bit
+# NOTE(review): "subc 0,0,g6" looks like it leaves 0 or -1 in g6, while the restore compares g6 with 1 and g6 is returned below -- confirm against the i960 manual.
+ mov g6,g0 # return the saved carry word in g0
+ ret
diff --git a/gmp/mpn/i960/addmul_1.s b/gmp/mpn/i960/addmul_1.s
new file mode 100644
index 0000000000..984f540f7b
--- /dev/null
+++ b/gmp/mpn/i960/addmul_1.s
@@ -0,0 +1,46 @@
+# I960 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 4
+ .globl ___gmpn_addmul_1
+___gmpn_addmul_1: # registers as used below: g0 = res_ptr, g1 = s1_ptr, g2 = size, g3 = s2_limb
+ subo g2,0,g2 # g2 = -size; serves as a negative index counting up to 0
+ shlo 2,g2,g4 # g4 = g2 * 4, i.e. the negated byte length
+ subo g4,g1,g1 # g1 = s1_ptr + 4*size, so (g1)[g2*4] starts at the first limb
+ subo g4,g0,g13 # g13 = res_ptr + 4*size, biased the same way; frees g0
+ mov 0,g0 # g0 = carry limb passed between iterations, initially 0
+# g0 is also what is left for the caller at ret (the final carry limb).
+ cmpo 1,0 # clear C bit on AC.cc
+# Per limb: res[i] = low word of (res[i] + s1[i] * s2_limb + g0); g0 = remaining high word.
+Loop: ld (g1)[g2*4],g5 # g5 = s1[i]
+ emul g3,g5,g6 # product of g3 and g5; low word in g6, g7 is used as the high word
+ ld (g13)[g2*4],g5 # g5 = res[i]
+
+ addc g0,g6,g6 # add incoming carry limb; relies on the C bit being clear
+ addc 0,g7,g7 # fold that carry into the high word
+ addc g5,g6,g6 # add res[i]; relies on the C bit being clear
+ st g6,(g13)[g2*4] # res[i] = low word
+ addc 0,g7,g0 # next carry limb = high word + carry
+
+ addo g2,1,g2 # advance index towards 0
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/gmp/mpn/i960/mul_1.s b/gmp/mpn/i960/mul_1.s
new file mode 100644
index 0000000000..7912aa1fb7
--- /dev/null
+++ b/gmp/mpn/i960/mul_1.s
@@ -0,0 +1,43 @@
+# I960 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 4
+ .globl ___gmpn_mul_1
+___gmpn_mul_1: # registers as used below: g0 = res_ptr, g1 = s1_ptr, g2 = size, g3 = s2_limb
+ subo g2,0,g2 # g2 = -size; serves as a negative index counting up to 0
+ shlo 2,g2,g4 # g4 = g2 * 4, i.e. the negated byte length
+ subo g4,g1,g1 # g1 = s1_ptr + 4*size, so (g1)[g2*4] starts at the first limb
+ subo g4,g0,g13 # g13 = res_ptr + 4*size, biased the same way; frees g0
+ mov 0,g0 # g0 = carry limb passed between iterations, initially 0
+# g0 is also what is left for the caller at ret (the final carry limb).
+ cmpo 1,0 # clear C bit on AC.cc
+# Per limb: res[i] = low word of (s1[i] * s2_limb + g0); g0 = remaining high word.
+Loop: ld (g1)[g2*4],g5 # g5 = s1[i]
+ emul g3,g5,g6 # product of g3 and g5; low word in g6, g7 is used as the high word
+
+ addc g0,g6,g6 # add incoming carry limb; relies on the C bit being clear
+ st g6,(g13)[g2*4] # res[i] = low word
+ addc 0,g7,g0 # next carry limb = high word + carry
+
+ addo g2,1,g2 # advance index towards 0
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/gmp/mpn/i960/sub_n.s b/gmp/mpn/i960/sub_n.s
new file mode 100644
index 0000000000..87adcbf6a2
--- /dev/null
+++ b/gmp/mpn/i960/sub_n.s
@@ -0,0 +1,41 @@
+# I960 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 4
+ .globl ___gmpn_sub_n
+___gmpn_sub_n: # registers as used below: g0 = res_ptr, g1 = s1_ptr, g2 = s2_ptr, g3 = limb count
+ mov 1,g6 # set carry-save register
+ cmpo 1,0 # clear cy
+# Each pass subtracts one limb pair; the carry is parked in g6 because the loop-closing compare overwrites the condition code.
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ subc g4,g5,g4 # main subtract
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, cy will be 0
+# NOTE(review): "subc 0,0,g6" looks like it leaves 0 or -1 in g6, while the restore compares g6 with 1 and g6 is returned below -- confirm against the i960 manual.
+ mov g6,g0 # return the saved carry word in g0
+ ret
diff --git a/gmp/mpn/ia64/README b/gmp/mpn/ia64/README
index 45c2d6337f..9252271ab7 100644
--- a/gmp/mpn/ia64/README
+++ b/gmp/mpn/ia64/README
@@ -1,30 +1,19 @@
-Copyright 2000-2005 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -166,7 +155,7 @@ ldfp8 with all alignment headache that implies.
================================================================
mpn_addmul_N
-For best speed, we need to give up using mpn_addmul_2 as the main multiply
+For best speed, we need to give up using mpn_addmul_1 as the main multiply
building block, and instead take multiple v limbs per loop. For the Itanium
1, we need to take about 8 limbs at a time for full speed. For the Itanium
2, something like mpn_addmul_4 should be enough.
@@ -248,7 +237,7 @@ loops, since that will allow us to do better load-use scheduling without too
much unrolling.
Depending on size or operand alignment, we get 1 c/l or 0.5 c/l on Itanium
-2, according to tune/speed. Cache bank conflicts?
+2, according to tests/devel/try. Cache bank conflicts?
@@ -279,3 +268,10 @@ Optimization, Intel document 251110-003, May 2004.
All the above documents can be found online at
http://developer.intel.com/design/itanium/manuals.htm
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/gmp/mpn/ia64/add_n_sub_n.asm b/gmp/mpn/ia64/add_n_sub_n.asm
deleted file mode 100644
index 34a506568f..0000000000
--- a/gmp/mpn/ia64/add_n_sub_n.asm
+++ /dev/null
@@ -1,309 +0,0 @@
-dnl IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: ?
-C Itanium 2: 2.25
-
-C INPUT PARAMETERS
-define(`sp', `r32')
-define(`dp', `r33')
-define(`up', `r34')
-define(`vp', `r35')
-define(`n', `r36')
-
-C Some useful aliases for registers we use
-define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
-define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
-define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27')
-define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31')
-define(`up0',`up')
-define(`up1',`r14')
-define(`vp0',`vp')
-define(`vp1',`r15')
-
-define(`cmpltu', `cmp.ltu')
-define(`cmpeqor', `cmp.eq.or')
-
-ASM_START()
-PROLOGUE(mpn_add_n_sub_n)
- .prologue
- .save ar.lc, r2
- .body
-ifdef(`HAVE_ABI_32',`
- addp4 sp = 0, sp C M I
- addp4 dp = 0, dp C M I
- nop.i 0
- addp4 up = 0, up C M I
- addp4 vp = 0, vp C M I
- zxt4 n = n C I
- ;;
-')
-
- and r9 = 3, n C M I
- mov.i r2 = ar.lc C I0
- add up1 = 8, up0 C M I
- add vp1 = 8, vp0 C M I
- add r8 = -2, n C M I
- add r10 = 256, up C M I
- ;;
- shr.u r8 = r8, 2 C I0
- cmp.eq p10, p0 = 0, r9 C M I
- cmp.eq p11, p0 = 2, r9 C M I
- cmp.eq p12, p0 = 3, r9 C M I
- add r11 = 256, vp C M I
- ;;
- mov.i ar.lc = r8 C I0
- (p10) br L(b0) C B
- (p11) br L(b2) C B
- (p12) br L(b3) C B
-
-L(b1): ld8 u3 = [up0], 8 C M01
- add up1 = 8, up1 C M I
- cmpltu p14, p15 = 4, n C M I
- ld8 v3 = [vp0], 8 C M01
- add vp1 = 8, vp1 C M I
- ;;
- add s3 = u3, v3 C M I
- sub d3 = u3, v3 C M I
- mov r8 = 0 C M I
- ;;
- cmpltu p9, p0 = s3, v3 C carry from add3 M I
- cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
- (p15) br L(cj1) C B
- st8 [sp] = s3, 8 C M23
- st8 [dp] = d3, 8 C M23
- br L(c0) C B
-
-L(b0): cmp.ne p9, p0 = r0, r0 C M I
- cmp.ne p13, p0 = r0, r0 C M I
-L(c0): ld8 u0 = [up0], 16 C M01
- ld8 u1 = [up1], 16 C M01
- ;;
- ld8 v0 = [vp0], 16 C M01
- ld8 v1 = [vp1], 16 C M01
- ;;
- ld8 u2 = [up0], 16 C M01
- ld8 u3 = [up1], 16 C M01
- ;;
- ld8 v2 = [vp0], 16 C M01
- ld8 v3 = [vp1], 16 C M01
- ;;
- add s0 = u0, v0 C M I
- add s1 = u1, v1 C M I
- sub d0 = u0, v0 C M I
- sub d1 = u1, v1 C M I
- ;;
- cmpltu p6, p0 = s0, v0 C carry from add0 M I
- cmpltu p7, p0 = s1, v1 C carry from add1 M I
- cmpltu p10, p0 = u0, v0 C borrow from sub0 M I
- cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
- ;;
- nop 0 C
- br.cloop.dptk L(top) C B
- br L(end) C B
-
-L(b3): ld8 u1 = [up0], 8 C M01
- add up1 = 8, up1 C M I
- ld8 v1 = [vp0], 8 C M01
- ;;
- add vp1 = 8, vp1 C M I
- add s1 = u1, v1 C M I
- sub d1 = u1, v1 C M I
- ;;
- cmpltu p7, p0 = s1, v1 C carry from add1 M I
- cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
- ;;
- st8 [sp] = s1, 8 C M23
- st8 [dp] = d1, 8 C M23
- br L(c2) C B
-
- ALIGN(32)
-L(b2): cmp.ne p7, p0 = r0, r0 C M I
- cmp.ne p11, p0 = r0, r0 C M I
- nop 0
-L(c2): ld8 u2 = [up0], 16 C M01
- ld8 u3 = [up1], 16 C M01
- cmpltu p14, p0 = 4, n C M I
- ;;
- ld8 v2 = [vp0], 16 C M01
- ld8 v3 = [vp1], 16 C M01
- (p14) br L(gt4) C B
- ;;
- add s2 = u2, v2 C M I
- add s3 = u3, v3 C M I
- sub d2 = u2, v2 C M I
- sub d3 = u3, v3 C M I
- ;;
- cmpltu p8, p0 = s2, v2 C carry from add0 M I
- cmpltu p9, p0 = s3, v3 C carry from add3 M I
- cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
- cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
- br L(cj2) C B
- ;;
-L(gt4): ld8 u0 = [up0], 16 C M01
- ld8 u1 = [up1], 16 C M01
- ;;
- ld8 v0 = [vp0], 16 C M01
- ld8 v1 = [vp1], 16 C M01
- ;;
- add s2 = u2, v2 C M I
- add s3 = u3, v3 C M I
- sub d2 = u2, v2 C M I
- sub d3 = u3, v3 C M I
- ;;
- cmpltu p8, p0 = s2, v2 C carry from add0 M I
- cmpltu p9, p0 = s3, v3 C carry from add1 M I
- cmpltu p12, p0 = u2, v2 C borrow from sub0 M I
- cmpltu p13, p0 = u3, v3 C borrow from sub1 M I
- br.cloop.dptk L(mid) C B
-
- ALIGN(32)
-L(top):
- ld8 u0 = [up0], 16 C M01
- ld8 u1 = [up1], 16 C M01
- (p9) cmpeqor p6, p0 = -1, s0 C M I
- (p9) add s0 = 1, s0 C M I
- (p13) cmpeqor p10, p0 = 0, d0 C M I
- (p13) add d0 = -1, d0 C M I
- ;;
- ld8 v0 = [vp0], 16 C M01
- ld8 v1 = [vp1], 16 C M01
- (p6) cmpeqor p7, p0 = -1, s1 C M I
- (p6) add s1 = 1, s1 C M I
- (p10) cmpeqor p11, p0 = 0, d1 C M I
- (p10) add d1 = -1, d1 C M I
- ;;
- st8 [sp] = s0, 8 C M23
- st8 [dp] = d0, 8 C M23
- add s2 = u2, v2 C M I
- add s3 = u3, v3 C M I
- sub d2 = u2, v2 C M I
- sub d3 = u3, v3 C M I
- ;;
- st8 [sp] = s1, 8 C M23
- st8 [dp] = d1, 8 C M23
- cmpltu p8, p0 = s2, v2 C carry from add2 M I
- cmpltu p9, p0 = s3, v3 C carry from add3 M I
- cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
- cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
- ;;
-L(mid):
- ld8 u2 = [up0], 16 C M01
- ld8 u3 = [up1], 16 C M01
- (p7) cmpeqor p8, p0 = -1, s2 C M I
- (p7) add s2 = 1, s2 C M I
- (p11) cmpeqor p12, p0 = 0, d2 C M I
- (p11) add d2 = -1, d2 C M I
- ;;
- ld8 v2 = [vp0], 16 C M01
- ld8 v3 = [vp1], 16 C M01
- (p8) cmpeqor p9, p0 = -1, s3 C M I
- (p8) add s3 = 1, s3 C M I
- (p12) cmpeqor p13, p0 = 0, d3 C M I
- (p12) add d3 = -1, d3 C M I
- ;;
- st8 [sp] = s2, 8 C M23
- st8 [dp] = d2, 8 C M23
- add s0 = u0, v0 C M I
- add s1 = u1, v1 C M I
- sub d0 = u0, v0 C M I
- sub d1 = u1, v1 C M I
- ;;
- st8 [sp] = s3, 8 C M23
- st8 [dp] = d3, 8 C M23
- cmpltu p6, p0 = s0, v0 C carry from add0 M I
- cmpltu p7, p0 = s1, v1 C carry from add1 M I
- cmpltu p10, p0 = u0, v0 C borrow from sub0 M I
- cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
- ;;
- lfetch [r10], 32 C M?
- lfetch [r11], 32 C M?
- br.cloop.dptk L(top) C B
- ;;
-
-L(end):
- nop 0
- nop 0
- (p9) cmpeqor p6, p0 = -1, s0 C M I
- (p9) add s0 = 1, s0 C M I
- (p13) cmpeqor p10, p0 = 0, d0 C M I
- (p13) add d0 = -1, d0 C M I
- ;;
- nop 0
- nop 0
- (p6) cmpeqor p7, p0 = -1, s1 C M I
- (p6) add s1 = 1, s1 C M I
- (p10) cmpeqor p11, p0 = 0, d1 C M I
- (p10) add d1 = -1, d1 C M I
- ;;
- st8 [sp] = s0, 8 C M23
- st8 [dp] = d0, 8 C M23
- add s2 = u2, v2 C M I
- add s3 = u3, v3 C M I
- sub d2 = u2, v2 C M I
- sub d3 = u3, v3 C M I
- ;;
- st8 [sp] = s1, 8 C M23
- st8 [dp] = d1, 8 C M23
- cmpltu p8, p0 = s2, v2 C carry from add2 M I
- cmpltu p9, p0 = s3, v3 C carry from add3 M I
- cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
- cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
- ;;
-L(cj2):
- (p7) cmpeqor p8, p0 = -1, s2 C M I
- (p7) add s2 = 1, s2 C M I
- (p11) cmpeqor p12, p0 = 0, d2 C M I
- (p11) add d2 = -1, d2 C M I
- mov r8 = 0 C M I
- nop 0
- ;;
- st8 [sp] = s2, 8 C M23
- st8 [dp] = d2, 8 C M23
- (p8) cmpeqor p9, p0 = -1, s3 C M I
- (p8) add s3 = 1, s3 C M I
- (p12) cmpeqor p13, p0 = 0, d3 C M I
- (p12) add d3 = -1, d3 C M I
- ;;
-L(cj1):
- (p9) mov r8 = 2 C M I
- ;;
- mov.i ar.lc = r2 C I0
- (p13) add r8 = 1, r8 C M I
- st8 [sp] = s3 C M23
- st8 [dp] = d3 C M23
- br.ret.sptk.many b0 C B
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/ia64/addmul_1.asm b/gmp/mpn/ia64/addmul_1.asm
index ffa3297763..6cd9d2b755 100644
--- a/gmp/mpn/ia64/addmul_1.asm
+++ b/gmp/mpn/ia64/addmul_1.asm
@@ -1,35 +1,23 @@
dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl result to a second limb vector.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2000-2005, 2007 Free Software Foundation, Inc.
+dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/ia64/addmul_2.asm b/gmp/mpn/ia64/addmul_2.asm
index f5bc46b75d..2c258022ae 100644
--- a/gmp/mpn/ia64/addmul_2.asm
+++ b/gmp/mpn/ia64/addmul_2.asm
@@ -1,35 +1,22 @@
dnl IA-64 mpn_addmul_2 -- Multiply a n-limb number with a 2-limb number and
dnl add the result to a (n+1)-limb number.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -37,11 +24,16 @@ C cycles/limb
C Itanium: 3.65
C Itanium 2: 1.625
+C Note that this is very similar to mul_2.asm. If you change this file,
+C please change that file too.
+
C TODO
C * Clean up variable names, and try to decrease the number of distinct
C registers used.
-C * Clean up feed-in code to not require zeroing several registers.
-C * Make sure we don't depend on uninitialised predicate registers.
+C * Clean up feed-in code to not require zeroing several registers.
+C * Make sure we don't depend on uninitialized predicate registers.
+C * We currently cross-jump very aggressively, at the expense of a few cycles
+C per operation. Consider changing that.
C * Could perhaps save a few cycles by using 1 c/l carry propagation in
C wind-down code.
C * Ultimately rewrite. The problem with this code is that it first uses a
@@ -102,607 +94,564 @@ define(`ry',`f50')
define(`uy',`f51')
ASM_START()
-PROLOGUE(mpn_addmul_2s)
- .prologue
- .save ar.lc, r2
- .body
-
-ifdef(`HAVE_ABI_32',`
-.mmi; addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- addp4 vp = 0, vp C M I
-.mmi; nop 1
- nop 1
- zxt4 n = n C I
- ;;')
-
-.mmi; ldf8 ux = [up], 8 C M
- ldf8 v0 = [vp], 8 C M
- mov r2 = ar.lc C I0
-.mmi; ldf8 rx = [rp], 8 C M
- and r14 = 3, n C M I
- add n = -2, n C M I
- ;;
-.mmi; ldf8 uy = [up], 8 C M
- ldf8 v1 = [vp] C M
- shr.u n = n, 2 C I0
-.mmi; ldf8 ry = [rp], -8 C M
- cmp.eq p14, p0 = 1, r14 C M I
- cmp.eq p11, p0 = 2, r14 C M I
- ;;
-.mmi; add srp = 16, rp C M I
- cmp.eq p15, p0 = 3, r14 C M I
- mov ar.lc = n C I0
-.bbb; (p14) br.dptk L(x01) C B
- (p11) br.dptk L(x10) C B
- (p15) br.dptk L(x11) C B
- ;;
-
-L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair
- mov fp2a_3 = f0
- br L(b00)
-L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair
- mov fp2a_2 = f0
- br L(b01)
-L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair
- mov fp2a_1 = f0
- br L(b10)
-L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair
- mov fp2a_0 = f0
- br L(b11)
-
-EPILOGUE()
-
PROLOGUE(mpn_addmul_2)
.prologue
.save ar.lc, r2
.body
-ifdef(`HAVE_ABI_32',`
-.mmi; addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- addp4 vp = 0, vp C M I
-.mmi; nop 1
- nop 1
- zxt4 n = n C I
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ zxt4 n = n C I
;;')
-.mmi; ldf8 ux = [up], 8 C M
- ldf8 v0 = [vp], 8 C M
- mov r2 = ar.lc C I0
-.mmi; ldf8 rx = [rp], 8 C M
- and r14 = 3, n C M I
- add n = -2, n C M I
- ;;
-.mmi; ldf8 uy = [up], 8 C M
- ldf8 v1 = [vp] C M
- shr.u n = n, 2 C I0
-.mmi; ldf8 ry = [rp], -8 C M
- cmp.eq p14, p0 = 1, r14 C M I
- cmp.eq p11, p0 = 2, r14 C M I
- ;;
-.mmi; add srp = 16, rp C M I
- cmp.eq p15, p6 = 3, r14 C M I
- mov ar.lc = n C I0
-.bbb; (p14) br.dptk L(b01) C B
- (p11) br.dptk L(b10) C B
- (p15) br.dptk L(b11) C B
- ;;
+{.mmi C 00
+ ldf8 ux = [up], 8 C M
+ ldf8 v0 = [vp], 8 C M
+ mov.i r2 = ar.lc C I0
+}{.mmi
+ ldf8 rx = [rp], 8 C M
+ and r14 = 3, n C M I
+ add n = -2, n C M I
+ ;;
+}{.mmi C 01
+ ldf8 uy = [up], 8 C M
+ ldf8 v1 = [vp] C M
+ shr.u n = n, 2 C I0
+}{.mmi
+ ldf8 ry = [rp], -8 C M
+ cmp.eq p10, p0 = 1, r14 C M I
+ cmp.eq p11, p0 = 2, r14 C M I
+ ;;
+}{.mmi C 02
+ add srp = 16, rp C M I
+ cmp.eq p12, p0 = 3, r14 C M I
+ mov.i ar.lc = n C I0
+}{.bbb
+ (p10) br.dptk .Lb01 C B
+ (p11) br.dptk .Lb10 C B
+ (p12) br.dptk .Lb11 C B
+ ;;
+}
ALIGN(32)
-L(b00):
-.mmi; ldf8 r_1 = [srp], 8
- ldf8 u_1 = [up], 8
- mov acc1_2 = 0
-.mmi; mov pr1_2 = 0
- mov pr0_3 = 0
- cmp.ne p8, p9 = r0, r0
- ;;
-.mfi; ldf8 r_2 = [srp], 8
- xma.l fp0b_3 = ux, v0, rx
- cmp.ne p12, p13 = r0, r0
-.mfb; ldf8 u_2 = [up], 8
- xma.hu fp1b_3 = ux, v0, rx
- br.cloop.dptk L(gt4)
-
- xma.l fp0b_0 = uy, v0, ry
- xma.hu fp1a_0 = uy, v0, ry
- ;;
- getfsig acc0 = fp0b_3
- (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
- (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
- ;;
- xma.l fp0b_1 = u_1, v0, r_1
- xma.hu fp1a_1 = u_1, v0, r_1
- ;;
- getfsig pr0_0 = fp0b_0
- xma.l fp1b_0 = uy, v1, fp1a_0
- xma.hu fp2a_0 = uy, v1, fp1a_0
- ;;
- getfsig pr1_3 = fp1b_3
- getfsig acc1_3 = fp2a_3
- xma.l fp0b_2 = u_2, v0, r_2
- xma.hu fp1a_2 = u_2, v0, r_2
- br L(cj4)
-
-L(gt4): xma.l fp0b_0 = uy, v0, ry
- xma.hu fp1a_0 = uy, v0, ry
- ;;
- ldf8 r_3 = [srp], 8
- getfsig acc0 = fp0b_3
- (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
- ldf8 u_3 = [up], 8
- (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
- ;;
- xma.l fp0b_1 = u_1, v0, r_1
- xma.hu fp1a_1 = u_1, v0, r_1
- ;;
- ldf8 r_0 = [srp], 8
- getfsig pr0_0 = fp0b_0
- xma.l fp1b_0 = uy, v1, fp1a_0
- xma.hu fp2a_0 = uy, v1, fp1a_0
- ;;
- ldf8 u_0 = [up], 8
- getfsig pr1_3 = fp1b_3
- xma.l fp0b_2 = u_2, v0, r_2
- ;;
- getfsig acc1_3 = fp2a_3
- xma.hu fp1a_2 = u_2, v0, r_2
- br L(00)
+.Lb00: ldf8 r_1 = [srp], 8
+ ldf8 u_1 = [up], 8
+ mov acc1_2 = 0
+ mov pr1_2 = 0
+ mov pr0_3 = 0
+ cmp.ne p8, p9 = r0, r0
+ ;;
+ ldf8 r_2 = [srp], 8
+ xma.l fp0b_3 = ux, v0, rx
+ cmp.ne p12, p13 = r0, r0
+ ldf8 u_2 = [up], 8
+ xma.hu fp1a_3 = ux, v0, rx
+ br.cloop.dptk .grt4
+
+ xma.l fp0b_0 = uy, v0, ry
+ xma.hu fp1a_0 = uy, v0, ry
+ ;;
+ getf.sig acc0 = fp0b_3
+ xma.l fp1b_3 = ux, v1, fp1a_3
+ xma.hu fp2a_3 = ux, v1, fp1a_3
+ ;;
+ xma.l fp0b_1 = u_1, v0, r_1
+ xma.hu fp1a_1 = u_1, v0, r_1
+ ;;
+ getf.sig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
+ ;;
+ getf.sig pr1_3 = fp1b_3
+ getf.sig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, r_2
+ xma.hu fp1a_2 = u_2, v0, r_2
+ br .Lcj4
+
+.grt4: xma.l fp0b_0 = uy, v0, ry
+ xma.hu fp1a_0 = uy, v0, ry
+ ;;
+ ldf8 r_3 = [srp], 8
+ getf.sig acc0 = fp0b_3
+ xma.l fp1b_3 = ux, v1, fp1a_3
+ ldf8 u_3 = [up], 8
+ xma.hu fp2a_3 = ux, v1, fp1a_3
+ ;;
+ xma.l fp0b_1 = u_1, v0, r_1
+ xma.hu fp1a_1 = u_1, v0, r_1
+ ;;
+ ldf8 r_0 = [srp], 8
+ getf.sig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
+ ;;
+ ldf8 u_0 = [up], 8
+ getf.sig pr1_3 = fp1b_3
+ ;;
+ getf.sig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, r_2
+ xma.hu fp1a_2 = u_2, v0, r_2
+ br .LL00
ALIGN(32)
-L(b01):
-.mmi; ldf8 r_0 = [srp], 8 C M
- ldf8 u_0 = [up], 8 C M
- mov acc1_1 = 0 C M I
-.mmi; mov pr1_1 = 0 C M I
- mov pr0_2 = 0 C M I
- cmp.ne p6, p7 = r0, r0 C M I
- ;;
-.mfi; ldf8 r_1 = [srp], 8 C M
- xma.l fp0b_2 = ux, v0, rx C F
- cmp.ne p10, p11 = r0, r0 C M I
-.mfi; ldf8 u_1 = [up], 8 C M
- xma.hu fp1b_2 = ux, v0, rx C F
- nop 1
- ;;
- xma.l fp0b_3 = uy, v0, ry C F
- xma.hu fp1a_3 = uy, v0, ry C F
- ;;
-.mmf; getfsig acc0 = fp0b_2 C M
- ldf8 r_2 = [srp], 8 C M
- (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s
-.mfb; ldf8 u_2 = [up], 8 C M
- (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s
- br.cloop.dptk L(gt5)
-
- xma.l fp0b_0 = u_0, v0, r_0 C F
- xma.hu fp1a_0 = u_0, v0, r_0 C F
- ;;
- getfsig pr0_3 = fp0b_3 C M
- xma.l fp1b_3 = uy, v1,fp1a_3 C F
- xma.hu fp2a_3 = uy, v1,fp1a_3 C F
- ;;
- getfsig pr1_2 = fp1b_2 C M
- getfsig acc1_2 = fp2a_2 C M
- xma.l fp0b_1 = u_1, v0, r_1 C F
- xma.hu fp1a_1 = u_1, v0, r_1 C F
- br L(cj5)
-
-L(gt5): xma.l fp0b_0 = u_0, v0, r_0
- xma.hu fp1a_0 = u_0, v0, r_0
- ;;
- getfsig pr0_3 = fp0b_3
- ldf8 r_3 = [srp], 8
- xma.l fp1b_3 = uy, v1, fp1a_3
- xma.hu fp2a_3 = uy, v1, fp1a_3
- ;;
- ldf8 u_3 = [up], 8
- getfsig pr1_2 = fp1b_2
- xma.l fp0b_1 = u_1, v0, r_1
- ;;
- getfsig acc1_2 = fp2a_2
- xma.hu fp1a_1 = u_1, v0, r_1
- br L(01)
+.Lb01: ldf8 r_0 = [srp], 8 C M
+ ldf8 u_0 = [up], 8 C M
+ mov acc1_1 = 0 C M I
+ mov pr1_1 = 0 C M I
+ mov pr0_2 = 0 C M I
+ cmp.ne p6, p7 = r0, r0 C M I
+ ;;
+ ldf8 r_1 = [srp], 8 C M
+ xma.l fp0b_2 = ux, v0, rx C F
+ cmp.ne p10, p11 = r0, r0 C M I
+ ldf8 u_1 = [up], 8 C M
+ xma.hu fp1a_2 = ux, v0, rx C F
+ ;;
+ xma.l fp0b_3 = uy, v0, ry C F
+ xma.hu fp1a_3 = uy, v0, ry C F
+ ;;
+ getf.sig acc0 = fp0b_2 C M
+ ldf8 r_2 = [srp], 8 C M
+ xma.l fp1b_2 = ux, v1,fp1a_2 C F
+ xma.hu fp2a_2 = ux, v1,fp1a_2 C F
+ ldf8 u_2 = [up], 8 C M
+ br.cloop.dptk .grt5
+
+ xma.l fp0b_0 = u_0, v0, r_0 C F
+ xma.hu fp1a_0 = u_0, v0, r_0 C F
+ ;;
+ getf.sig pr0_3 = fp0b_3 C M
+ xma.l fp1b_3 = uy, v1,fp1a_3 C F
+ xma.hu fp2a_3 = uy, v1,fp1a_3 C F
+ ;;
+ getf.sig pr1_2 = fp1b_2 C M
+ getf.sig acc1_2 = fp2a_2 C M
+ xma.l fp0b_1 = u_1, v0, r_1 C F
+ xma.hu fp1a_1 = u_1, v0, r_1 C F
+ br .Lcj5
+
+.grt5: xma.l fp0b_0 = u_0, v0, r_0
+ xma.hu fp1a_0 = u_0, v0, r_0
+ ;;
+ getf.sig pr0_3 = fp0b_3
+ ldf8 r_3 = [srp], 8
+ xma.l fp1b_3 = uy, v1, fp1a_3
+ xma.hu fp2a_3 = uy, v1, fp1a_3
+ ;;
+ ldf8 u_3 = [up], 8
+ getf.sig pr1_2 = fp1b_2
+ ;;
+ getf.sig acc1_2 = fp2a_2
+ xma.l fp0b_1 = u_1, v0, r_1
+ xma.hu fp1a_1 = u_1, v0, r_1
+ br .LL01
ALIGN(32)
-L(b10): br.cloop.dptk L(gt2)
- xma.l fp0b_1 = ux, v0, rx
- xma.hu fp1b_1 = ux, v0, rx
- ;;
- xma.l fp0b_2 = uy, v0, ry
- xma.hu fp1a_2 = uy, v0, ry
- ;;
- stf8 [rp] = fp0b_1, 8
- (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
- (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
- ;;
- getfsig acc0 = fp0b_2
- xma.l fp1b_2 = uy, v1, fp1a_2
- xma.hu fp2a_2 = uy, v1, fp1a_2
- ;;
- getfsig pr1_1 = fp1b_1
- getfsig acc1_1 = fp2a_1
- mov ar.lc = r2
- getfsig pr1_2 = fp1b_2
- getfsig r8 = fp2a_2
- ;;
- add s0 = pr1_1, acc0
- ;;
- st8 [rp] = s0, 8
- cmp.ltu p8, p9 = s0, pr1_1
- sub r31 = -1, acc1_1
- ;;
- .pred.rel "mutex", p8, p9
- (p8) add acc0 = pr1_2, acc1_1, 1
- (p9) add acc0 = pr1_2, acc1_1
- (p8) cmp.leu p10, p0 = r31, pr1_2
- (p9) cmp.ltu p10, p0 = r31, pr1_2
- ;;
- st8 [rp] = acc0, 8
- (p10) add r8 = 1, r8
- br.ret.sptk.many b0
-
-
-L(gt2):
-.mmi; ldf8 r_3 = [srp], 8
- ldf8 u_3 = [up], 8
- mov acc1_0 = 0
- ;;
-.mfi; ldf8 r_0 = [srp], 8
- xma.l fp0b_1 = ux, v0, rx
- mov pr1_0 = 0
-.mfi; ldf8 u_0 = [up], 8
- xma.hu fp1b_1 = ux, v0, rx
- mov pr0_1 = 0
- ;;
- xma.l fp0b_2 = uy, v0, ry
- xma.hu fp1a_2 = uy, v0, ry
- ;;
- getfsig acc0 = fp0b_1
- ldf8 r_1 = [srp], 8
- (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
- (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
- ;;
- ldf8 u_1 = [up], 8
- xma.l fp0b_3 = u_3, v0, r_3
- xma.hu fp1a_3 = u_3, v0, r_3
- ;;
- getfsig pr0_2 = fp0b_2
- ldf8 r_2 = [srp], 8
- xma.l fp1b_2 = uy, v1, fp1a_2
- xma.hu fp2a_2 = uy, v1, fp1a_2
- ;;
- ldf8 u_2 = [up], 8
- getfsig pr1_1 = fp1b_1
- ;;
-.mfi; getfsig acc1_1 = fp2a_1
- xma.l fp0b_0 = u_0, v0, r_0
- cmp.ne p8, p9 = r0, r0
-.mfb; cmp.ne p12, p13 = r0, r0
- xma.hu fp1a_0 = u_0, v0, r_0
- br.cloop.sptk.clr L(top)
- br.many L(end)
+.Lb10: C 03
+ br.cloop.dptk .grt2
+ C 04
+ C 05
+ C 06
+ xma.l fp0b_1 = ux, v0, rx
+ xma.hu fp1a_1 = ux, v0, rx
+ ;; C 07
+ xma.l fp0b_2 = uy, v0, ry
+ xma.hu fp1a_2 = uy, v0, ry
+ ;; C 08
+ C 09
+ C 10
+ stf8 [rp] = fp0b_1, 8
+ xma.l fp1b_1 = ux, v1, fp1a_1
+ xma.hu fp2a_1 = ux, v1, fp1a_1
+ ;; C 11
+ getf.sig acc0 = fp0b_2
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;; C 12
+ C 13
+ C 14
+ getf.sig pr1_1 = fp1b_1
+ C 15
+ getf.sig acc1_1 = fp2a_1
+ C 16
+ getf.sig pr1_2 = fp1b_2
+ C 17
+ getf.sig r8 = fp2a_2
+ ;; C 18
+ C 19
+ add s0 = pr1_1, acc0
+ ;; C 20
+ st8 [rp] = s0, 8
+ cmp.ltu p8, p9 = s0, pr1_1
+ sub r31 = -1, acc1_1
+ ;; C 21
+ .pred.rel "mutex", p8, p9
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ (p8) cmp.leu p10, p0 = r31, pr1_2
+ (p9) cmp.ltu p10, p0 = r31, pr1_2
+ ;; C 22
+ st8 [rp] = acc0, 8
+ mov.i ar.lc = r2
+ (p10) add r8 = 1, r8
+ br.ret.sptk.many b0
+
+
+.grt2: ldf8 r_3 = [srp], 8
+ ldf8 u_3 = [up], 8
+ mov acc1_0 = 0
+ ;;
+ ldf8 r_0 = [srp], 8
+ xma.l fp0b_1 = ux, v0, rx
+ mov pr1_0 = 0
+ ldf8 u_0 = [up], 8
+ xma.hu fp1a_1 = ux, v0, rx
+ mov pr0_1 = 0
+ ;;
+ xma.l fp0b_2 = uy, v0, ry
+ xma.hu fp1a_2 = uy, v0, ry
+ ;;
+ getf.sig acc0 = fp0b_1
+ ldf8 r_1 = [srp], 8
+ xma.l fp1b_1 = ux, v1, fp1a_1
+ xma.hu fp2a_1 = ux, v1, fp1a_1
+ ;;
+ ldf8 u_1 = [up], 8
+ xma.l fp0b_3 = u_3, v0, r_3
+ xma.hu fp1a_3 = u_3, v0, r_3
+ ;;
+ getf.sig pr0_2 = fp0b_2
+ ldf8 r_2 = [srp], 8
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;;
+ ldf8 u_2 = [up], 8
+ getf.sig pr1_1 = fp1b_1
+ ;;
+ getf.sig acc1_1 = fp2a_1
+ xma.l fp0b_0 = u_0, v0, r_0
+ cmp.ne p8, p9 = r0, r0
+ cmp.ne p12, p13 = r0, r0
+ xma.hu fp1a_0 = u_0, v0, r_0
+ br .LL10
ALIGN(32)
-L(b11): ldf8 r_2 = [srp], 8
- mov pr1_3 = 0
- mov pr0_0 = 0
+.Lb11: mov acc1_3 = 0
+ mov pr1_3 = 0
+ mov pr0_0 = 0
+ cmp.ne p6, p7 = r0, r0
;;
- ldf8 u_2 = [up], 8
- mov acc1_3 = 0
- br.cloop.dptk L(gt3)
+ ldf8 r_2 = [srp], 8
+ ldf8 u_2 = [up], 8
+ br.cloop.dptk .grt3
;;
- cmp.ne p6, p7 = r0, r0
- xma.l fp0b_0 = ux, v0, rx
- xma.hu fp1b_0 = ux, v0, rx
+ xma.l fp0b_0 = ux, v0, rx
+ xma.hu fp1a_0 = ux, v0, rx
;;
- cmp.ne p10, p11 = r0, r0
- xma.l fp0b_1 = uy, v0, ry
- xma.hu fp1a_1 = uy, v0, ry
+ cmp.ne p10, p11 = r0, r0
+ xma.l fp0b_1 = uy, v0, ry
+ xma.hu fp1a_1 = uy, v0, ry
;;
- getfsig acc0 = fp0b_0
- (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
- (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
+ getf.sig acc0 = fp0b_0
+ xma.l fp1b_0 = ux, v1, fp1a_0
+ xma.hu fp2a_0 = ux, v1, fp1a_0
;;
- xma.l fp0b_2 = uy, v1, r_2
- xma.hu fp1a_2 = uy, v1, r_2
+ xma.l fp0b_2 = u_2, v0, r_2
+ xma.hu fp1a_2 = u_2, v0, r_2
;;
- getfsig pr0_1 = fp0b_1
- xma.l fp1b_1 = u_2, v0, fp1a_1
- xma.hu fp2a_1 = u_2, v0, fp1a_1
+ getf.sig pr0_1 = fp0b_1
+ xma.l fp1b_1 = uy, v1, fp1a_1
+ xma.hu fp2a_1 = uy, v1, fp1a_1
;;
- getfsig pr1_0 = fp1b_0
- getfsig acc1_0 = fp2a_0
- br L(cj3)
+ getf.sig pr1_0 = fp1b_0
+ getf.sig acc1_0 = fp2a_0
+ br .Lcj3
-L(gt3): ldf8 r_3 = [srp], 8
- xma.l fp0b_0 = ux, v0, rx
- cmp.ne p10, p11 = r0, r0
- ldf8 u_3 = [up], 8
- xma.hu fp1b_0 = ux, v0, rx
- cmp.ne p6, p7 = r0, r0
+.grt3: ldf8 r_3 = [srp], 8
+ xma.l fp0b_0 = ux, v0, rx
+ cmp.ne p10, p11 = r0, r0
+ ldf8 u_3 = [up], 8
+ xma.hu fp1a_0 = ux, v0, rx
;;
- xma.l fp0b_1 = uy, v0, ry
- xma.hu fp1a_1 = uy, v0, ry
+ xma.l fp0b_1 = uy, v0, ry
+ xma.hu fp1a_1 = uy, v0, ry
;;
- getfsig acc0 = fp0b_0
- ldf8 r_0 = [srp], 8
- (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
- ldf8 u_0 = [up], 8
- (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
+ getf.sig acc0 = fp0b_0
+ ldf8 r_0 = [srp], 8
+ xma.l fp1b_0 = ux, v1, fp1a_0
+ ldf8 u_0 = [up], 8
+ xma.hu fp2a_0 = ux, v1, fp1a_0
;;
- xma.l fp0b_2 = u_2, v0, r_2
- xma.hu fp1a_2 = u_2, v0, r_2
+ xma.l fp0b_2 = u_2, v0, r_2
+ xma.hu fp1a_2 = u_2, v0, r_2
;;
- getfsig pr0_1 = fp0b_1
- ldf8 r_1 = [srp], 8
- xma.l fp1b_1 = uy, v1, fp1a_1
- xma.hu fp2a_1 = uy, v1, fp1a_1
+ getf.sig pr0_1 = fp0b_1
+ ldf8 r_1 = [srp], 8
+ xma.l fp1b_1 = uy, v1, fp1a_1
+ xma.hu fp2a_1 = uy, v1, fp1a_1
;;
- ldf8 u_1 = [up], 8
- getfsig pr1_0 = fp1b_0
+ ldf8 u_1 = [up], 8
+ getf.sig pr1_0 = fp1b_0
;;
- getfsig acc1_0 = fp2a_0
- xma.l fp0b_3 = u_3, v0, r_3
- xma.hu fp1a_3 = u_3, v0, r_3
- br L(11)
+ getf.sig acc1_0 = fp2a_0
+ xma.l fp0b_3 = u_3, v0, r_3
+ xma.hu fp1a_3 = u_3, v0, r_3
+ br .LL11
C *** MAIN LOOP START ***
ALIGN(32)
-L(top): C 00
- .pred.rel "mutex", p12, p13
- getfsig pr0_3 = fp0b_3
- ldf8 r_3 = [srp], 8
- xma.l fp1b_3 = u_3, v1, fp1a_3
- (p12) add s0 = pr1_0, acc0, 1
- (p13) add s0 = pr1_0, acc0
- xma.hu fp2a_3 = u_3, v1, fp1a_3
+.Loop: C 00
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_3 = fp0b_3
+ ldf8 r_3 = [srp], 8
+ xma.l fp1b_3 = u_3, v1, fp1a_3
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ xma.hu fp2a_3 = u_3, v1, fp1a_3
;; C 01
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
- ldf8 u_3 = [up], 8
- getfsig pr1_2 = fp1b_2
- (p8) cmp.leu p6, p7 = acc0, pr0_1
- (p9) cmp.ltu p6, p7 = acc0, pr0_1
- (p12) cmp.leu p10, p11 = s0, pr1_0
- (p13) cmp.ltu p10, p11 = s0, pr1_0
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_3 = [up], 8
+ getf.sig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
;; C 02
- .pred.rel "mutex", p6, p7
- getfsig acc1_2 = fp2a_2
- st8 [rp] = s0, 8
- xma.l fp0b_1 = u_1, v0, r_1
- (p6) add acc0 = pr0_2, acc1_0, 1
- (p7) add acc0 = pr0_2, acc1_0
- xma.hu fp1a_1 = u_1, v0, r_1
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ xma.l fp0b_1 = u_1, v0, r_1
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, r_1
;; C 03
-L(01):
- .pred.rel "mutex", p10, p11
- getfsig pr0_0 = fp0b_0
- ldf8 r_0 = [srp], 8
- xma.l fp1b_0 = u_0, v1, fp1a_0
- (p10) add s0 = pr1_1, acc0, 1
- (p11) add s0 = pr1_1, acc0
- xma.hu fp2a_0 = u_0, v1, fp1a_0
+.LL01:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_0 = fp0b_0
+ ldf8 r_0 = [srp], 8
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
;; C 04
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
- ldf8 u_0 = [up], 8
- getfsig pr1_3 = fp1b_3
- (p6) cmp.leu p8, p9 = acc0, pr0_2
- (p7) cmp.ltu p8, p9 = acc0, pr0_2
- (p10) cmp.leu p12, p13 = s0, pr1_1
- (p11) cmp.ltu p12, p13 = s0, pr1_1
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_0 = [up], 8
+ getf.sig pr1_3 = fp1b_3
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
;; C 05
- .pred.rel "mutex", p8, p9
- getfsig acc1_3 = fp2a_3
- st8 [rp] = s0, 8
- xma.l fp0b_2 = u_2, v0, r_2
- (p8) add acc0 = pr0_3, acc1_1, 1
- (p9) add acc0 = pr0_3, acc1_1
- xma.hu fp1a_2 = u_2, v0, r_2
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_3 = fp2a_3
+ st8 [rp] = s0, 8
+ xma.l fp0b_2 = u_2, v0, r_2
+ (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, r_2
;; C 06
-L(00):
- .pred.rel "mutex", p12, p13
- getfsig pr0_1 = fp0b_1
- ldf8 r_1 = [srp], 8
- xma.l fp1b_1 = u_1, v1, fp1a_1
- (p12) add s0 = pr1_2, acc0, 1
- (p13) add s0 = pr1_2, acc0
- xma.hu fp2a_1 = u_1, v1, fp1a_1
+.LL00:
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_1 = fp0b_1
+ ldf8 r_1 = [srp], 8
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+ (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
;; C 07
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
- ldf8 u_1 = [up], 8
- getfsig pr1_0 = fp1b_0
- (p8) cmp.leu p6, p7 = acc0, pr0_3
- (p9) cmp.ltu p6, p7 = acc0, pr0_3
- (p12) cmp.leu p10, p11 = s0, pr1_2
- (p13) cmp.ltu p10, p11 = s0, pr1_2
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_1 = [up], 8
+ getf.sig pr1_0 = fp1b_0
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+ (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
;; C 08
- .pred.rel "mutex", p6, p7
- getfsig acc1_0 = fp2a_0
- st8 [rp] = s0, 8
- xma.l fp0b_3 = u_3, v0, r_3
- (p6) add acc0 = pr0_0, acc1_2, 1
- (p7) add acc0 = pr0_0, acc1_2
- xma.hu fp1a_3 = u_3, v0, r_3
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_0 = fp2a_0
+ st8 [rp] = s0, 8
+ xma.l fp0b_3 = u_3, v0, r_3
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ xma.hu fp1a_3 = u_3, v0, r_3
;; C 09
-L(11):
- .pred.rel "mutex", p10, p11
- getfsig pr0_2 = fp0b_2
- ldf8 r_2 = [srp], 8
- xma.l fp1b_2 = u_2, v1, fp1a_2
- (p10) add s0 = pr1_3, acc0, 1
- (p11) add s0 = pr1_3, acc0
- xma.hu fp2a_2 = u_2, v1, fp1a_2
+.LL11:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_2 = fp0b_2
+ ldf8 r_2 = [srp], 8
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+ (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
;; C 10
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
- ldf8 u_2 = [up], 8
- getfsig pr1_1 = fp1b_1
- (p6) cmp.leu p8, p9 = acc0, pr0_0
- (p7) cmp.ltu p8, p9 = acc0, pr0_0
- (p10) cmp.leu p12, p13 = s0, pr1_3
- (p11) cmp.ltu p12, p13 = s0, pr1_3
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_2 = [up], 8
+ getf.sig pr1_1 = fp1b_1
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+ (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
;; C 11
- .pred.rel "mutex", p8, p9
- getfsig acc1_1 = fp2a_1
- st8 [rp] = s0, 8
- xma.l fp0b_0 = u_0, v0, r_0
- (p8) add acc0 = pr0_1, acc1_3, 1
- (p9) add acc0 = pr0_1, acc1_3
- xma.hu fp1a_0 = u_0, v0, r_0
-L(10): br.cloop.sptk.clr L(top) C 12
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_1 = fp2a_1
+ st8 [rp] = s0, 8
+ xma.l fp0b_0 = u_0, v0, r_0
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ xma.hu fp1a_0 = u_0, v0, r_0
+.LL10: br.cloop.dptk .Loop C 12
;;
C *** MAIN LOOP END ***
-L(end):
- .pred.rel "mutex", p12, p13
-.mfi; getfsig pr0_3 = fp0b_3
- xma.l fp1b_3 = u_3, v1, fp1a_3
- (p12) add s0 = pr1_0, acc0, 1
-.mfi; (p13) add s0 = pr1_0, acc0
- xma.hu fp2a_3 = u_3, v1, fp1a_3
- nop 1
- ;;
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
-.mmi; getfsig pr1_2 = fp1b_2
- st8 [rp] = s0, 8
- (p8) cmp.leu p6, p7 = acc0, pr0_1
-.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
- (p12) cmp.leu p10, p11 = s0, pr1_0
- (p13) cmp.ltu p10, p11 = s0, pr1_0
- ;;
- .pred.rel "mutex", p6, p7
-.mfi; getfsig acc1_2 = fp2a_2
- xma.l fp0b_1 = u_1, v0, r_1
- nop 1
-.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
- (p7) add acc0 = pr0_2, acc1_0
- xma.hu fp1a_1 = u_1, v0, r_1
- ;;
-L(cj5):
- .pred.rel "mutex", p10, p11
-.mfi; getfsig pr0_0 = fp0b_0
- xma.l fp1b_0 = u_0, v1, fp1a_0
- (p10) add s0 = pr1_1, acc0, 1
-.mfi; (p11) add s0 = pr1_1, acc0
- xma.hu fp2a_0 = u_0, v1, fp1a_0
- nop 1
- ;;
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
-.mmi; getfsig pr1_3 = fp1b_3
- st8 [rp] = s0, 8
- (p6) cmp.leu p8, p9 = acc0, pr0_2
-.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
- (p10) cmp.leu p12, p13 = s0, pr1_1
- (p11) cmp.ltu p12, p13 = s0, pr1_1
- ;;
- .pred.rel "mutex", p8, p9
-.mfi; getfsig acc1_3 = fp2a_3
- xma.l fp0b_2 = u_2, v0, r_2
- nop 1
-.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
- (p9) add acc0 = pr0_3, acc1_1
- xma.hu fp1a_2 = u_2, v0, r_2
- ;;
-L(cj4):
- .pred.rel "mutex", p12, p13
-.mfi; getfsig pr0_1 = fp0b_1
- xma.l fp1b_1 = u_1, v1, fp1a_1
- (p12) add s0 = pr1_2, acc0, 1
-.mfi; (p13) add s0 = pr1_2, acc0
- xma.hu fp2a_1 = u_1, v1, fp1a_1
- nop 1
- ;;
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
-.mmi; getfsig pr1_0 = fp1b_0
- st8 [rp] = s0, 8
- (p8) cmp.leu p6, p7 = acc0, pr0_3
-.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
- (p12) cmp.leu p10, p11 = s0, pr1_2
- (p13) cmp.ltu p10, p11 = s0, pr1_2
- ;;
- .pred.rel "mutex", p6, p7
-.mmi; getfsig acc1_0 = fp2a_0
- (p6) add acc0 = pr0_0, acc1_2, 1
- (p7) add acc0 = pr0_0, acc1_2
- ;;
-L(cj3):
- .pred.rel "mutex", p10, p11
-.mfi; getfsig pr0_2 = fp0b_2
- xma.l fp1b_2 = u_2, v1, fp1a_2
- (p10) add s0 = pr1_3, acc0, 1
-.mfi; (p11) add s0 = pr1_3, acc0
- xma.hu fp2a_2 = u_2, v1, fp1a_2
- nop 1
- ;;
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
-.mmi; getfsig pr1_1 = fp1b_1
- st8 [rp] = s0, 8
- (p6) cmp.leu p8, p9 = acc0, pr0_0
-.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
- (p10) cmp.leu p12, p13 = s0, pr1_3
- (p11) cmp.ltu p12, p13 = s0, pr1_3
- ;;
- .pred.rel "mutex", p8, p9
-.mmi; getfsig acc1_1 = fp2a_1
- (p8) add acc0 = pr0_1, acc1_3, 1
- (p9) add acc0 = pr0_1, acc1_3
- ;;
- .pred.rel "mutex", p12, p13
-.mmi; (p12) add s0 = pr1_0, acc0, 1
- (p13) add s0 = pr1_0, acc0
- nop 1
- ;;
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
-.mmi; getfsig pr1_2 = fp1b_2
- st8 [rp] = s0, 8
- (p8) cmp.leu p6, p7 = acc0, pr0_1
-.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
- (p12) cmp.leu p10, p11 = s0, pr1_0
- (p13) cmp.ltu p10, p11 = s0, pr1_0
- ;;
- .pred.rel "mutex", p6, p7
-.mmi; getfsig r8 = fp2a_2
- (p6) add acc0 = pr0_2, acc1_0, 1
- (p7) add acc0 = pr0_2, acc1_0
- ;;
- .pred.rel "mutex", p10, p11
-.mmi; (p10) add s0 = pr1_1, acc0, 1
- (p11) add s0 = pr1_1, acc0
- (p6) cmp.leu p8, p9 = acc0, pr0_2
- ;;
- .pred.rel "mutex", p10, p11
-.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
- (p10) cmp.leu p12, p13 = s0, pr1_1
- (p11) cmp.ltu p12, p13 = s0, pr1_1
- ;;
- .pred.rel "mutex", p8, p9
-.mmi; st8 [rp] = s0, 8
- (p8) add acc0 = pr1_2, acc1_1, 1
- (p9) add acc0 = pr1_2, acc1_1
- ;;
- .pred.rel "mutex", p8, p9
-.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
- (p9) cmp.ltu p10, p11 = acc0, pr1_2
- (p12) add acc0 = 1, acc0
- ;;
-.mmi; st8 [rp] = acc0, 8
- (p12) cmpeqor p10, p0 = 0, acc0
- nop 1
- ;;
-.mib; (p10) add r8 = 1, r8
- mov ar.lc = r2
- br.ret.sptk.many b0
+
+.Lcj6:
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_3 = fp0b_3
+ xma.l fp1b_3 = u_3, v1, fp1a_3
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ xma.hu fp2a_3 = u_3, v1, fp1a_3
+ ;;
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ getf.sig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ xma.l fp0b_1 = u_1, v0, r_1
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, r_1
+ ;;
+.Lcj5:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_0 = fp0b_0
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
+ ;;
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ getf.sig pr1_3 = fp1b_3
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_3 = fp2a_3
+ st8 [rp] = s0, 8
+ xma.l fp0b_2 = u_2, v0, r_2
+ (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, r_2
+ ;;
+.Lcj4:
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_1 = fp0b_1
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+ (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ getf.sig pr1_0 = fp1b_0
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+ (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
+ ;;
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_0 = fp2a_0
+ st8 [rp] = s0, 8
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ ;;
+.Lcj3:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_2 = fp0b_2
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+ (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
+ ;;
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ getf.sig pr1_1 = fp1b_1
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+ (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
+ ;;
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_1 = fp2a_1
+ st8 [rp] = s0, 8
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ ;;
+.Lcj2:
+ .pred.rel "mutex", p12, p13
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ ;;
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ getf.sig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ ;;
+ .pred.rel "mutex", p10, p11
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ ;;
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ st8 [rp] = s0, 8
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ (p8) cmp.leu p10, p11 = acc0, pr1_2
+ (p9) cmp.ltu p10, p11 = acc0, pr1_2
+ (p12) add acc0 = 1, acc0
+ ;;
+ st8 [rp] = acc0, 8
+ (p12) cmp.eq.or p10, p0 = 0, acc0
+ mov r8 = acc1_2
+ ;;
+ .pred.rel "mutex", p10, p11
+ (p10) add r8 = 1, r8
+ mov.i ar.lc = r2
+ br.ret.sptk.many b0
EPILOGUE()
ASM_END()
diff --git a/gmp/mpn/ia64/aors_n.asm b/gmp/mpn/ia64/aors_n.asm
index 81be606190..fd3aaac460 100644
--- a/gmp/mpn/ia64/aors_n.asm
+++ b/gmp/mpn/ia64/aors_n.asm
@@ -1,34 +1,21 @@
dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -39,818 +26,586 @@ C Itanium 2: 1.25
C TODO
C * Consider using special code for small n, using something like
C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
-C * The non-nc code was trimmed cycle for cycle to its current state. It is
-C probably hard to save more that an odd cycle there. The nc code is much
-C cruder (since tune/speed doesn't have any applicable direct measurements).
-C * Without the nc entry points, this becomes around 1800 bytes of object
-C code; the nc code adds over 1000 bytes. We should perhaps sacrifice a
-C few cycles for the non-nc code and let it fall into the nc code.
C INPUT PARAMETERS
-define(`rp', `r32')
-define(`up', `r33')
-define(`vp', `r34')
-define(`n', `r35')
-define(`cy', `r36')
+define(`rp',`r32')
+define(`up',`r33')
+define(`vp',`r34')
+define(`n',`r35')
ifdef(`OPERATION_add_n',`
define(ADDSUB, add)
- define(CND, ltu)
+ define(PRED, ltu)
define(INCR, 1)
define(LIM, -1)
- define(LIM2, 0)
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)
+ define(func, mpn_add_n)
')
ifdef(`OPERATION_sub_n',`
define(ADDSUB, sub)
- define(CND, gtu)
+ define(PRED, gtu)
define(INCR, -1)
define(LIM, 0)
- define(LIM2, -1)
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)
+ define(func, mpn_sub_n)
')
-define(cmpeqor, `cmp.eq.or')
-define(PFDIST, 500)
-
C Some useful aliases for registers we use
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`u4',`r18') define(`u5',`r19') define(`u6',`r20') define(`u7',`r21')
define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
-define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
+define(`v4',`r28') define(`v5',`r29') define(`v6',`r30') define(`v7',`r31')
+define(`w0',`r22') define(`w1',`r9') define(`w2',`r8') define(`w3',`r23')
+define(`w4',`r22') define(`w5',`r9') define(`w6',`r8') define(`w7',`r23')
define(`rpx',`r3')
-define(`upadv',`r20') define(`vpadv',`r21')
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
ASM_START()
-PROLOGUE(func_nc)
+PROLOGUE(func)
.prologue
.save ar.lc, r2
.body
ifdef(`HAVE_ABI_32',`
- addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- nop.i 0
- addp4 vp = 0, vp C M I
- nop.m 0
- zxt4 n = n C I
+ addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ zxt4 n = n C I
;;
')
-
- {.mmi; ld8 r11 = [vp], 8 C M01
- ld8 r10 = [up], 8 C M01
- mov r2 = ar.lc C I0
-}{.mmi; and r14 = 7, n C M I
- cmp.lt p15, p14 = 8, n C M I
- add n = -6, n C M I
+{.mmi C 00
+ ld8 r11 = [vp], 8 C M01
+ ld8 r10 = [up], 8 C M01
+ mov.i r2 = ar.lc C I0
+}
+{.mmi
+ and r14 = 7, n C M I
+ cmp.lt p15, p14 = 8, n C M I
+ add n = -8, n C M I
;;
}
-.mmi; add upadv = PFDIST, up C Merging these lines into the feed-in
- add vpadv = PFDIST, vp C code could save a cycle per call at
- mov r23 = cy C the expense of code size.
- ;;
-{.mmi; cmp.eq p6, p0 = 1, r14 C M I
- cmp.eq p7, p0 = 2, r14 C M I
- cmp.eq p8, p0 = 3, r14 C M I
-}{.bbb
- (p6) br.dptk .Lc001 C B
- (p7) br.dptk .Lc010 C B
- (p8) br.dptk .Lc011 C B
- ;;
-}{.mmi; cmp.eq p9, p0 = 4, r14 C M I
- cmp.eq p10, p0 = 5, r14 C M I
- cmp.eq p11, p0 = 6, r14 C M I
-}{.bbb
- (p9) br.dptk .Lc100 C B
- (p10) br.dptk .Lc101 C B
- (p11) br.dptk .Lc110 C B
- ;;
-}{.mmi; ld8 r19 = [vp], 8 C M01
- ld8 r18 = [up], 8 C M01
- cmp.ne p13, p0 = 0, cy C copy cy to p13 M I
-}{.mmb; cmp.eq p12, p0 = 7, r14 C M I
- nop 0
- (p12) br.dptk .Lc111 C B
+{.mmi C 01
+ cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}
+{.bbb
+ (p6) br.dptk .Lb001 C B
+ (p7) br.dptk .Lb010 C B
+ (p8) br.dptk .Lb011 C B
;;
}
+{.mmi C 02
+ cmp.eq p9, p0 = 4, r14 C M I
+ cmp.eq p10, p0 = 5, r14 C M I
+ cmp.eq p11, p0 = 6, r14 C M I
+}
+{.bbb
+ (p9) br.dptk .Lb100 C B
+ (p10) br.dptk .Lb101 C B
+ (p11) br.dptk .Lb110 C B
+ ;;
+} C 03
+{.mmb
+ cmp.eq p12, p0 = 7, r14 C M I
+ add n = -1, n C loop count M I
+ (p12) br.dptk .Lb111 C B
+}
-.Lc000:
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; add vpadv = PFDIST, vp C M I
- ld8 v0 = [vp], 8 C M01
- mov ar.lc = n C I0
-.mmi; ld8 u0 = [up], 8 C M01
- ADDSUB w1 = r10, r11 C M I
- nop 0
- ;;
-.mmi; add upadv = PFDIST, up C M I
- ld8 v1 = [vp], 8 C M01
- cmp.CND p7, p0 = w1, r10 C M I
-.mmi; ld8 u1 = [up], 8 C M01
- ADDSUB w2 = r18, r19 C M I
- add rpx = 8, rp C M I
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- cmp.CND p8, p0 = w2, r18 C M I
- (p13) cmpeqor p7, p0 = LIM, w1 C M I
-.mmi; ld8 u2 = [up], 8 C M01
- (p13) add w1 = INCR, w1 C M I
- ADDSUB w3 = u3, v3 C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
-.mmb; ld8 u3 = [up], 8 C M01
- (p7) add w2 = INCR, w2 C M I
- br L(m0)
-
-
-.Lc001:
-.mmi;
- (p15) ld8 v1 = [vp], 8 C M01
- (p15) ld8 u1 = [up], 8 C M01
- ADDSUB w0 = r10, r11 C M I
-.mmb; nop 0
- nop 0
- (p15) br 1f
- ;;
-.mmi; cmp.ne p9, p0 = 0, r23 C M I
- mov r8 = 0
- cmp.CND p6, p0 = w0, r10 C M I
- ;;
-.mmb;
- (p9) cmpeqor p6, p0 = LIM, w0 C M I
- (p9) add w0 = INCR, w0 C M I
- br L(cj1) C B
-1:
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- mov ar.lc = n C I0
-.mmi; nop 0
- cmp.ne p9, p0 = 0, r23 C M I
- nop 0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- cmp.CND p6, p0 = w0, r10 C M I
- add rpx = 16, rp C M I
-.mmb; ld8 u0 = [up], 8 C M01
- ADDSUB w1 = u1, v1 C M I
- br L(c1) C B
-
-
-.Lc010:
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- mov r8 = 0 C M I
-.mmb; ADDSUB w3 = r10, r11 C M I
- cmp.ne p8, p0 = 0, r23 C M I
- (p15) br 1f C B
- ;;
-.mmi; cmp.CND p9, p0 = w3, r10 C M I
- ADDSUB w0 = u0, v0 C M I
- (p8) add w3 = INCR, w3 C M I
- ;;
-.mmb; cmp.CND p6, p0 = w0, u0 C M I
- (p8) cmpeqor p9, p0 = LIM2, w3 C M I
- br L(cj2) C B
-1:
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- mov ar.lc = n C I0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- cmp.CND p9, p0 = w3, r10 C M I
- ;;
-.mmi;
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
- (p8) add w3 = INCR, w3 C M I
- ADDSUB w0 = u0, v0 C M I
-.mmb; add rpx = 24, rp C M I
- nop 0
- br L(m23) C B
-
-
-.Lc011:
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- shr.u n = n, 3 C I0
-.mmi; ADDSUB w2 = r10, r11 C M I
- cmp.ne p7, p0 = 0, r23 C M I
- nop 0
- ;;
-.mmb; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- (p15) br 1f C B
-.mmi; cmp.CND p8, p0 = w2, r10 C M I
- ADDSUB w3 = u3, v3 C M I
- nop 0
- ;;
-.mmb;
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
- (p7) add w2 = INCR, w2 C M I
- br L(cj3) C B
-1:
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- ADDSUB w3 = u3, v3 C M I
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- cmp.CND p8, p0 = w2, r10 C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- mov ar.lc = n C I0
-.mmi; ld8 u3 = [up], 8 C M01
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
- (p7) add w2 = INCR, w2 C M I
- ;;
-.mmi; add rpx = 32, rp C M I
- st8 [rp] = w2, 8 C M23
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
-.mmb;
- (p8) add w3 = INCR, w3 C M I
- ADDSUB w0 = u0, v0 C M I
- br L(m23)
-
-
-.Lc100:
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- shr.u n = n, 3 C I0
-.mmi; ADDSUB w1 = r10, r11 C M I
- nop 0
- nop 0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- add rpx = 8, rp C M I
-.mmi; cmp.ne p6, p0 = 0, r23 C M I
- cmp.CND p7, p0 = w1, r10 C M I
- nop 0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- ADDSUB w2 = u2, v2 C M I
-.mmb;
- (p6) cmpeqor p7, p0 = LIM, w1 C M I
- (p6) add w1 = INCR, w1 C M I
- (p14) br L(cj4)
- ;;
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- mov ar.lc = n C I0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- cmp.CND p8, p0 = w2, u2 C M I
- nop 0
-.mmi; ld8 u2 = [up], 8 C M01
- nop 0
- ADDSUB w3 = u3, v3 C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
-.mmb; ld8 u3 = [up], 8 C M01
- (p7) add w2 = INCR, w2 C M I
- br L(m4)
-
-
-.Lc101:
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- mov ar.lc = n C I0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- ADDSUB w0 = r10, r11 C M I
-.mmi; cmp.ne p9, p0 = 0, r23 C M I
- add rpx = 16, rp C M I
- nop 0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- cmp.CND p6, p0 = w0, r10 C M I
-.mbb; ADDSUB w1 = u1, v1 C M I
- (p15) br L(c5) C B
- br L(end) C B
-
-
-.Lc110:
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; add upadv = PFDIST, up C M I
- add vpadv = PFDIST, vp C M I
- mov ar.lc = n C I0
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- ADDSUB w3 = r10, r11 C M I
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- ADDSUB w0 = u0, v0 C M I
-.mmi; cmp.CND p9, p0 = w3, r10 C M I
- cmp.ne p8, p0 = 0, r23 C M I
- add rpx = 24, rp C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- nop 0
-.mmb;
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
- (p8) add w3 = INCR, w3 C M I
- br L(m67) C B
-
-
-.Lc111:
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; add upadv = PFDIST, up C M I
- ld8 v1 = [vp], 8 C M01
- mov ar.lc = n C I0
-.mmi; ld8 u1 = [up], 8 C M01
- ADDSUB w2 = r10, r11 C M I
- nop 0
- ;;
-.mmi; add vpadv = PFDIST, vp C M I
- ld8 v2 = [vp], 8 C M01
- cmp.CND p8, p0 = w2, r10 C M I
-.mmi; ld8 u2 = [up], 8 C M01
- ADDSUB w3 = r18, r19 C M I
- nop 0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, r18 C M I
- (p13) cmpeqor p8, p0 = LIM, w2 C M I
-.mmi; ld8 u3 = [up], 8 C M01
- (p13) add w2 = INCR, w2 C M I
- nop 0
- ;;
-.mmi; add rpx = 32, rp C M I
- st8 [rp] = w2, 8 C M23
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
-.mmb;
- (p8) add w3 = INCR, w3 C M I
- ADDSUB w0 = u0, v0 C M I
- br L(m67)
-EPILOGUE()
-
-PROLOGUE(func)
- .prologue
- .save ar.lc, r2
- .body
-ifdef(`HAVE_ABI_32',`
- addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- nop.i 0
- addp4 vp = 0, vp C M I
- nop.m 0
- zxt4 n = n C I
+.Lb000: ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ add rpx = 8, rp C M I
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ ADDSUB w1 = r10, r11 C M I
;;
-')
-
- {.mmi; ld8 r11 = [vp], 8 C M01
- ld8 r10 = [up], 8 C M01
- mov r2 = ar.lc C I0
-}{.mmi; and r14 = 7, n C M I
- cmp.lt p15, p14 = 8, n C M I
- add n = -6, n C M I
- ;;
-}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
- cmp.eq p7, p0 = 2, r14 C M I
- cmp.eq p8, p0 = 3, r14 C M I
-}{.bbb
- (p6) br.dptk .Lb001 C B
- (p7) br.dptk .Lb010 C B
- (p8) br.dptk .Lb011 C B
- ;;
-}{.mmi; cmp.eq p9, p0 = 4, r14 C M I
- cmp.eq p10, p0 = 5, r14 C M I
- cmp.eq p11, p0 = 6, r14 C M I
-}{.bbb
- (p9) br.dptk .Lb100 C B
- (p10) br.dptk .Lb101 C B
- (p11) br.dptk .Lb110 C B
- ;;
-}{.mmi; ld8 r19 = [vp], 8 C M01
- ld8 r18 = [up], 8 C M01
- cmp.ne p13, p0 = r0, r0 C clear "CF" M I
-}{.mmb; cmp.eq p12, p0 = 7, r14 C M I
- mov r23 = 0 C M I
- (p12) br.dptk .Lb111 C B
+ ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ cmp.PRED p7, p0 = w1, r10 C M I
;;
-}
-
-.Lb000:
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- ADDSUB w1 = r10, r11 C M I
- ;;
-.mmi; ld8 v1 = [vp], 8 C M01
- cmp.CND p7, p0 = w1, r10 C M I
- mov ar.lc = n C I0
-.mmi; ld8 u1 = [up], 8 C M01
- ADDSUB w2 = r18, r19 C M I
- add rpx = 8, rp C M I
- ;;
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- cmp.CND p8, p0 = w2, r18 C M I
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- ADDSUB w3 = u3, v3 C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
-.mmb; ld8 u3 = [up], 8 C M01
- (p7) add w2 = INCR, w2 C M I
- br L(m0) C B
-
-
- ALIGN(32)
-.Lb001:
-.mmi; ADDSUB w0 = r10, r11 C M I
- (p15) ld8 v1 = [vp], 8 C M01
- mov r8 = 0 C M I
- ;;
-.mmb; cmp.CND p6, p0 = w0, r10 C M I
- (p15) ld8 u1 = [up], 8 C M01
- (p14) br L(cj1) C B
- ;;
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- shr.u n = n, 3 C I0
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- cmp.CND p6, p0 = w0, r10 C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- mov ar.lc = n C I0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- ADDSUB w1 = u1, v1 C M I
- ;;
-.mmi; ld8 v1 = [vp], 8 C M01
- cmp.CND p7, p0 = w1, u1 C M I
- ADDSUB w2 = u2, v2 C M I
-.mmb; ld8 u1 = [up], 8 C M01
- add rpx = 16, rp C M I
- br L(m1) C B
-
-
- ALIGN(32)
-.Lb010:
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- shr.u n = n, 3 C I0
-.mmb; ADDSUB w3 = r10, r11 C M I
- nop 0
- (p15) br L(gt2) C B
- ;;
-.mmi; cmp.CND p9, p0 = w3, r10 C M I
- ADDSUB w0 = u0, v0 C M I
- mov r8 = 0 C M I
- ;;
-.mmb; nop 0
- cmp.CND p6, p0 = w0, u0 C M I
- br L(cj2) C B
-L(gt2):
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- nop 0
- ;;
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- mov ar.lc = n C I0
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- nop 0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, r10 C M I
- ADDSUB w0 = u0, v0 C M I
-.mmb; ld8 u3 = [up], 8 C M01
- add rpx = 24, rp C M I
- br L(m23) C B
-
-
- ALIGN(32)
-.Lb011:
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- ADDSUB w2 = r10, r11 C M I
- ;;
-.mmb; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- (p15) br 1f C B
-.mmb; cmp.CND p8, p0 = w2, r10 C M I
- ADDSUB w3 = u3, v3 C M I
- br L(cj3) C B
-1:
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- ADDSUB w3 = u3, v3 C M I
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- cmp.CND p8, p0 = w2, r10 C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- mov ar.lc = n C I0
-.mmi; ld8 u3 = [up], 8 C M01
- nop 0
- nop 0
- ;;
-.mmi; add rpx = 32, rp C M I
- st8 [rp] = w2, 8 C M23
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
-.mmb;
- (p8) add w3 = INCR, w3 C M I
- ADDSUB w0 = u0, v0 C M I
- br L(m23) C B
-
-
- ALIGN(32)
-.Lb100:
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- ADDSUB w1 = r10, r11 C M I
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- cmp.CND p7, p0 = w1, r10 C M I
-.mmb; nop 0
- ADDSUB w2 = u2, v2 C M I
- (p14) br L(cj4) C B
- ;;
-L(gt4):
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- mov ar.lc = n C I0
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- nop 0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- cmp.CND p8, p0 = w2, u2 C M I
- nop 0
-.mmi; ld8 u2 = [up], 8 C M01
- ADDSUB w3 = u3, v3 C M I
- add rpx = 8, rp C M I
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
-.mmb; ld8 u3 = [up], 8 C M01
- (p7) add w2 = INCR, w2 C M I
- br L(m4) C B
-
-
- ALIGN(32)
-.Lb101:
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- ADDSUB w0 = r10, r11 C M I
- ;;
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- add rpx = 16, rp C M I
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- nop 0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- cmp.CND p6, p0 = w0, r10 C M I
- nop 0
-.mmb; ld8 u0 = [up], 8 C M01
- ADDSUB w1 = u1, v1 C M I
- (p14) br L(cj5) C B
- ;;
-L(gt5):
-.mmi; ld8 v1 = [vp], 8 C M01
- cmp.CND p7, p0 = w1, u1 C M I
- mov ar.lc = n C I0
-.mmb; ld8 u1 = [up], 8 C M01
- ADDSUB w2 = u2, v2 C M I
- br L(m5) C B
-
-
- ALIGN(32)
-.Lb110:
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- ADDSUB w3 = r10, r11 C M I
- ;;
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- mov ar.lc = n C I0
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- nop 0
- ;;
-.mmi; ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, r10 C M I
- ADDSUB w0 = u0, v0 C M I
-.mmb; ld8 u3 = [up], 8 C M01
- add rpx = 24, rp C M I
- br L(m67) C B
-
+ ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ ADDSUB w2 = u2, v2 C M I
+ ;;
+ ld8 v6 = [vp], 8 C M01
+ ld8 u6 = [up], 8 C M01
+ cmp.PRED p8, p0 = w2, u2 C M I
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ cmp.PRED p9, p0 = w3, u3 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w2 C M I
+ (p7) add w2 = INCR, w2 C M I
+ (p14) br.cond.dptk .Lcj8 C B
+ ;;
+
+.grt8: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+ add r11 = 512, vp
+ ld8 v2 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u2 = [up], 8 C M01
+ nop.i 0
+ nop.b 0
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ br .LL000 C B
- ALIGN(32)
-.Lb111:
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- shr.u n = n, 3 C I0
- ;;
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- ADDSUB w2 = r10, r11 C M I
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- cmp.CND p8, p0 = w2, r10 C M I
- mov ar.lc = n C I0
-.mmi; ld8 u2 = [up], 8 C M01
- ADDSUB w3 = r18, r19 C M I
- nop 0
- ;;
-.mmi; add upadv = PFDIST, up
- add vpadv = PFDIST, vp
- nop 0
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- cmp.CND p9, p0 = w3, r18 C M I
- ;;
-.mmi; add rpx = 32, rp C M I
- st8 [rp] = w2, 8 C M23
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
-.mmb;
- (p8) add w3 = INCR, w3 C M I
- ADDSUB w0 = u0, v0 C M I
- br L(m67) C B
+.Lb001: add rpx = 16, rp C M I
+ ADDSUB w0 = r10, r11 C M I
+ (p15) br.cond.dpnt .grt1 C B
+ ;;
+ cmp.PRED p6, p0 = w0, r10 C M I
+ mov r8 = 0 C M I
+ br .Lcj1 C B
+.grt1: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ cmp.ne p9, p0 = r0, r0 C read near Loop
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ cmp.PRED p6, p0 = w0, r10 C M I
+ ;;
+ ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ ;;
+ ld8 v6 = [vp], 8 C M01
+ ld8 u6 = [up], 8 C M01
+ cmp.PRED p7, p0 = w1, u1 C M I
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ ADDSUB w2 = u2, v2 C M I
+ ;;
+ add r11 = 512, vp
+ ld8 v0 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u0 = [up], 8 C M01
+ br.cloop.dptk .Loop C B
+ br .Lcj9 C B
+
+.Lb010: ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ add rpx = 24, rp C M I
+ ADDSUB w7 = r10, r11 C M I
+ (p15) br.cond.dpnt .grt2 C B
+ ;;
+ cmp.PRED p9, p0 = w7, r10 C M I
+ ADDSUB w0 = u0, v0 C M I
+ br .Lcj2 C B
+
+.grt2: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ ;;
+ ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ cmp.PRED p9, p0 = w7, r10 C M I
+ ;;
+ ld8 v6 = [vp], 8 C M01
+ ld8 u6 = [up], 8 C M01
+ ADDSUB w0 = u0, v0 C M I
+ ;;
+ add r11 = 512, vp
+ ld8 v7 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u7 = [up], 8 C M01
+ br .LL01x C B
+
+.Lb011: ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ ADDSUB w6 = r10, r11 C M I
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ (p15) br.cond.dpnt .grt3 C B
+ ;;
+ cmp.PRED p8, p0 = w6, r10 C M I
+ ADDSUB w7 = u7, v7 C M I
+ ;;
+ st8 [rp] = w6, 8 C M23
+ cmp.PRED p9, p0 = w7, u7 C M I
+ br .Lcj3 C B
+
+.grt3: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ add rpx = 32, rp C M I
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ cmp.PRED p8, p0 = w6, r10 C M I
+ ;;
+ ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ADDSUB w7 = u7, v7 C M I
+ nop.i 0
+ nop.b 0
+ ;;
+ ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ cmp.PRED p9, p0 = w7, u7 C M I
+ ;;
+ add r11 = 512, vp
+ ld8 v6 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u6 = [up], 8 C M01
+ (p8) cmp.eq.or p9, p0 = LIM, w7 C M I
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ (p8) add w7 = INCR, w7 C M I
+ st8 [rp] = w6, 8 C M23
+ ADDSUB w0 = u0, v0 C M I
+ br .LL01x C B
+
+.Lb100: ld8 v6 = [vp], 8 C M01
+ ld8 u6 = [up], 8 C M01
+ add rpx = 8, rp C M I
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ ADDSUB w5 = r10, r11 C M I
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ (p15) br.cond.dpnt .grt4 C B
+ ;;
+ cmp.PRED p7, p0 = w5, r10 C M I
+ ADDSUB w6 = u6, v6 C M I
+ ;;
+ cmp.PRED p8, p0 = w6, u6 C M I
+ ADDSUB w7 = u7, v7 C M I
+ br .Lcj4 C B
+
+.grt4: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ cmp.PRED p7, p0 = w5, r10 C M I
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w6 = u6, v6 C M I
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ cmp.PRED p8, p0 = w6, u6 C M I
+ ;;
+ ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ ADDSUB w7 = u7, v7 C M I
+ ;;
+ add r11 = 512, vp
+ ld8 v6 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u6 = [up], 8 C M01
+ cmp.PRED p9, p0 = w7, u7 C M I
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ (p7) cmp.eq.or p8, p0 = LIM, w6 C M I
+ (p7) add w6 = INCR, w6 C M I
+ br .LL100 C B
+
+.Lb101: ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ add rpx = 16, rp C M I
+ ;;
+ ld8 v6 = [vp], 8 C M01
+ ld8 u6 = [up], 8 C M01
+ ADDSUB w4 = r10, r11 C M I
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ cmp.PRED p6, p0 = w4, r10 C M I
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w5 = u5, v5 C M I
+ shr.u n = n, 3 C I0
+ (p15) br.cond.dpnt .grt5 C B
+ ;;
+ cmp.PRED p7, p0 = w5, u5 C M I
+ ADDSUB w6 = u6, v6 C M I
+ br .Lcj5 C B
+
+.grt5: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ cmp.PRED p7, p0 = w5, u5 C M I
+ ;;
+ ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ ADDSUB w6 = u6, v6 C M I
+ ;;
+ add r11 = 512, vp
+ ld8 v5 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u5 = [up], 8 C M01
+ br .LL101 C B
+
+.Lb110: ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ add rpx = 24, rp C M I
+ ;;
+ ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ ADDSUB w3 = r10, r11 C M I
+ ;;
+ ld8 v6 = [vp], 8 C M01
+ ld8 u6 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ cmp.PRED p9, p0 = w3, r10 C M I
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w4 = u4, v4 C M I
+ (p14) br.cond.dptk .Lcj67 C B
+ ;;
+
+.grt6: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ cmp.PRED p9, p0 = w3, r10 C M I
+ nop.i 0
+ nop.b 0
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w4 = u4, v4 C M I
+ ;;
+ add r11 = 512, vp
+ ld8 v3 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u3 = [up], 8 C M01
+ br .LL11x C B
+
+.Lb111: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ add rpx = 32, rp C M I
+ ;;
+ ld8 v4 = [vp], 8 C M01
+ ld8 u4 = [up], 8 C M01
+ ADDSUB w2 = r10, r11 C M I
+ ;;
+ ld8 v5 = [vp], 8 C M01
+ ld8 u5 = [up], 8 C M01
+ cmp.PRED p8, p0 = w2, r10 C M I
+ ;;
+ ld8 v6 = [vp], 8 C M01
+ ld8 u6 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+ ld8 v7 = [vp], 8 C M01
+ ld8 u7 = [up], 8 C M01
+ cmp.PRED p9, p0 = w3, u3 C M I
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ (p15) br.cond.dpnt .grt7 C B
+ ;;
+ st8 [rp] = w2, 8 C M23
+ (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
+ (p8) add w3 = INCR, w3 C M I
+ ADDSUB w4 = u4, v4 C M I
+ br .Lcj67 C B
+
+.grt7: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
+ nop.i 0
+ nop.b 0
+ ;;
+ add r11 = 512, vp
+ ld8 v2 = [vp], 8 C M01
+ add r10 = 512, up
+ ld8 u2 = [up], 8 C M01
+ (p8) add w3 = INCR, w3 C M I
+ nop.b 0
+ ;;
+ ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ st8 [rp] = w2, 8 C M23
+ ADDSUB w4 = u4, v4 C M I
+ br .LL11x C B
C *** MAIN LOOP START ***
ALIGN(32)
-L(top):
-L(c5): ld8 v1 = [vp], 8 C M01
- cmp.CND p7, p0 = w1, u1 C M I
- (p9) cmpeqor p6, p0 = LIM, w0 C M I
- ld8 u1 = [up], 8 C M01
- (p9) add w0 = INCR, w0 C M I
- ADDSUB w2 = u2, v2 C M I
- ;;
-L(m5): ld8 v2 = [vp], 8 C M01
- cmp.CND p8, p0 = w2, u2 C M I
- (p6) cmpeqor p7, p0 = LIM, w1 C M I
- ld8 u2 = [up], 8 C M01
- (p6) add w1 = INCR, w1 C M I
- ADDSUB w3 = u3, v3 C M I
- ;;
- st8 [rp] = w0, 8 C M23
- ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
- ld8 u3 = [up], 8 C M01
- (p7) add w2 = INCR, w2 C M I
- ;;
-L(m4): st8 [rp] = w1, 16 C M23
- st8 [rpx] = w2, 32 C M23
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
- lfetch [upadv], 64
- (p8) add w3 = INCR, w3 C M I
- ADDSUB w0 = u0, v0 C M I
- ;;
-L(m23): st8 [rp] = w3, 8 C M23
- ld8 v0 = [vp], 8 C M01
- cmp.CND p6, p0 = w0, u0 C M I
- ld8 u0 = [up], 8 C M01
- ADDSUB w1 = u1, v1 C M I
- nop.b 0
- ;;
-L(c1): ld8 v1 = [vp], 8 C M01
- cmp.CND p7, p0 = w1, u1 C M I
- (p9) cmpeqor p6, p0 = LIM, w0 C M I
- ld8 u1 = [up], 8 C M01
- (p9) add w0 = INCR, w0 C M I
- ADDSUB w2 = u2, v2 C M I
- ;;
-L(m1): ld8 v2 = [vp], 8 C M01
- cmp.CND p8, p0 = w2, u2 C M I
- (p6) cmpeqor p7, p0 = LIM, w1 C M I
- ld8 u2 = [up], 8 C M01
- (p6) add w1 = INCR, w1 C M I
- ADDSUB w3 = u3, v3 C M I
- ;;
- st8 [rp] = w0, 8 C M23
- ld8 v3 = [vp], 8 C M01
- cmp.CND p9, p0 = w3, u3 C M I
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
- ld8 u3 = [up], 8 C M01
- (p7) add w2 = INCR, w2 C M I
- ;;
-L(m0): st8 [rp] = w1, 16 C M23
- st8 [rpx] = w2, 32 C M23
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
- lfetch [vpadv], 64
- (p8) add w3 = INCR, w3 C M I
- ADDSUB w0 = u0, v0 C M I
- ;;
-L(m67): st8 [rp] = w3, 8 C M23
- ld8 v0 = [vp], 8 C M01
- cmp.CND p6, p0 = w0, u0 C M I
- ld8 u0 = [up], 8 C M01
- ADDSUB w1 = u1, v1 C M I
- br.cloop.dptk L(top) C B
+.Loop: ld8 v1 = [vp], 8 C M01
+ cmp.PRED p7, p0 = w1, u1 C M I
+ (p9) cmp.eq.or p6, p0 = LIM, w0 C M I
+ ld8 u1 = [up], 8 C M01
+ (p9) add w0 = INCR, w0 C M I
+ ADDSUB w2 = u2, v2 C M I
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ cmp.PRED p8, p0 = w2, u2 C M I
+ (p6) cmp.eq.or p7, p0 = LIM, w1 C M I
+ ld8 u2 = [up], 8 C M01
+ (p6) add w1 = INCR, w1 C M I
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+ st8 [rp] = w0, 8 C M23
+ ld8 v3 = [vp], 8 C M01
+ cmp.PRED p9, p0 = w3, u3 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w2 C M I
+ ld8 u3 = [up], 8 C M01
+ (p7) add w2 = INCR, w2 C M I
+ ;;
+.LL000: st8 [rp] = w1, 16 C M23
+ st8 [rpx] = w2, 32 C M23
+ (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
+ lfetch [r10], 64
+ (p8) add w3 = INCR, w3 C M I
+ ADDSUB w4 = u4, v4 C M I
+ ;;
+.LL11x: st8 [rp] = w3, 8 C M23
+ ld8 v4 = [vp], 8 C M01
+ cmp.PRED p6, p0 = w4, u4 C M I
+ ld8 u4 = [up], 8 C M01
+ ADDSUB w5 = u5, v5 C M I
+ ;;
+ ld8 v5 = [vp], 8 C M01
+ cmp.PRED p7, p0 = w5, u5 C M I
+ (p9) cmp.eq.or p6, p0 = LIM, w4 C M I
+ ld8 u5 = [up], 8 C M01
+ (p9) add w4 = INCR, w4 C M I
+ ADDSUB w6 = u6, v6 C M I
+ ;;
+.LL101: ld8 v6 = [vp], 8 C M01
+ cmp.PRED p8, p0 = w6, u6 C M I
+ (p6) cmp.eq.or p7, p0 = LIM, w5 C M I
+ ld8 u6 = [up], 8 C M01
+ (p6) add w5 = INCR, w5 C M I
+ ADDSUB w7 = u7, v7 C M I
+ ;;
+ st8 [rp] = w4, 8 C M23
+ ld8 v7 = [vp], 8 C M01
+ cmp.PRED p9, p0 = w7, u7 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w6 C M I
+ ld8 u7 = [up], 8 C M01
+ (p7) add w6 = INCR, w6 C M I
+ ;;
+.LL100: st8 [rp] = w5, 16 C M23
+ st8 [rpx] = w6, 32 C M23
+ (p8) cmp.eq.or p9, p0 = LIM, w7 C M I
+ lfetch [r11], 64
+ (p8) add w7 = INCR, w7 C M I
+ ADDSUB w0 = u0, v0 C M I
+ ;;
+.LL01x: st8 [rp] = w7, 8 C M23
+ ld8 v0 = [vp], 8 C M01
+ cmp.PRED p6, p0 = w0, u0 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ br.cloop.dptk .Loop C B
;;
C *** MAIN LOOP END ***
-L(end):
-.mmi;
- (p9) cmpeqor p6, p0 = LIM, w0 C M I
- (p9) add w0 = INCR, w0 C M I
- mov ar.lc = r2 C I0
-L(cj5):
-.mmi; cmp.CND p7, p0 = w1, u1 C M I
- ADDSUB w2 = u2, v2 C M I
- nop 0
- ;;
-.mmi; st8 [rp] = w0, 8 C M23
- (p6) cmpeqor p7, p0 = LIM, w1 C M I
- (p6) add w1 = INCR, w1 C M I
-L(cj4):
-.mmi; cmp.CND p8, p0 = w2, u2 C M I
- ADDSUB w3 = u3, v3 C M I
- nop 0
- ;;
-.mmi; st8 [rp] = w1, 8 C M23
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
- (p7) add w2 = INCR, w2 C M I
-L(cj3):
-.mmi; cmp.CND p9, p0 = w3, u3 C M I
- ADDSUB w0 = u0, v0 C M I
- nop 0
- ;;
-.mmi; st8 [rp] = w2, 8 C M23
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
- (p8) add w3 = INCR, w3 C M I
-.mmi; cmp.CND p6, p0 = w0, u0 C M I
- nop 0
- mov r8 = 0 C M I
- ;;
-L(cj2):
-.mmi; st8 [rp] = w3, 8 C M23
- (p9) cmpeqor p6, p0 = LIM, w0 C M I
- (p9) add w0 = INCR, w0 C M I
- ;;
-L(cj1):
-.mmb; st8 [rp] = w0, 8 C M23
- (p6) mov r8 = 1 C M I
- br.ret.sptk.many b0 C B
+ cmp.PRED p7, p0 = w1, u1 C M I
+ (p9) cmp.eq.or p6, p0 = LIM, w0 C M I
+ (p9) add w0 = INCR, w0 C M I
+ ADDSUB w2 = u2, v2 C M I
+ ;;
+.Lcj9: cmp.PRED p8, p0 = w2, u2 C M I
+ (p6) cmp.eq.or p7, p0 = LIM, w1 C M I
+ st8 [rp] = w0, 8 C M23
+ (p6) add w1 = INCR, w1 C M I
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+ cmp.PRED p9, p0 = w3, u3 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w2 C M I
+ (p7) add w2 = INCR, w2 C M I
+ ;;
+.Lcj8: st8 [rp] = w1, 16 C M23
+ st8 [rpx] = w2, 32 C M23
+ (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
+ (p8) add w3 = INCR, w3 C M I
+ ADDSUB w4 = u4, v4 C M I
+ ;;
+.Lcj67: st8 [rp] = w3, 8 C M23
+ cmp.PRED p6, p0 = w4, u4 C M I
+ ADDSUB w5 = u5, v5 C M I
+ ;;
+ cmp.PRED p7, p0 = w5, u5 C M I
+ (p9) cmp.eq.or p6, p0 = LIM, w4 C M I
+ (p9) add w4 = INCR, w4 C M I
+ ADDSUB w6 = u6, v6 C M I
+ ;;
+.Lcj5: cmp.PRED p8, p0 = w6, u6 C M I
+ (p6) cmp.eq.or p7, p0 = LIM, w5 C M I
+ st8 [rp] = w4, 8 C M23
+ (p6) add w5 = INCR, w5 C M I
+ ADDSUB w7 = u7, v7 C M I
+ ;;
+.Lcj4: cmp.PRED p9, p0 = w7, u7 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w6 C M I
+ (p7) add w6 = INCR, w6 C M I
+ ;;
+ st8 [rp] = w5, 16 C M23
+ st8 [rpx] = w6, 32 C M23
+.Lcj3:
+ (p8) cmp.eq.or p9, p0 = LIM, w7 C M I
+ (p8) add w7 = INCR, w7 C M I
+ ADDSUB w0 = u0, v0 C M I
+ ;;
+.Lcj2: st8 [rp] = w7, 8 C M23
+ cmp.PRED p6, p0 = w0, u0 C M I
+ ;;
+ (p9) cmp.eq.or p6, p0 = LIM, w0 C M I
+ (p9) add w0 = INCR, w0 C M I
+ mov r8 = 0 C M I
+ ;;
+.Lcj1: st8 [rp] = w0, 8 C M23
+ mov.i ar.lc = r2 C I0
+ (p6) mov r8 = 1 C M I
+ br.ret.sptk.many b0 C B
EPILOGUE()
ASM_END()
diff --git a/gmp/mpn/ia64/aorslsh1_n.asm b/gmp/mpn/ia64/aorslsh1_n.asm
new file mode 100644
index 0000000000..5348149c87
--- /dev/null
+++ b/gmp/mpn/ia64/aorslsh1_n.asm
@@ -0,0 +1,323 @@
+dnl IA-64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 3.0
+C Itanium 2: 1.5
+
+C TODO
+C * Use shladd in feed-in code (for mpn_addlsh1_n).
+
+C INPUT PARAMETERS
+define(`rp',`r32') C destination pointer, advanced by each st8
+define(`up',`r33') C first source pointer, limbs used as loaded
+define(`vp',`r34') C second source pointer, limbs are shifted left one bit
+define(`n',`r35') C limb count, later reduced to the ar.lc loop count
+
+ifdef(`OPERATION_addlsh1_n',` C macros selecting the addition variant
+ define(`ADDSUB', `add') C limb operation
+ define(`PRED', `ltu') C result < operand signals a carry out
+ define(`INCR', `1') C value that applies an incoming carry
+ define(`LIM', `-1') C limb value that ripples an incoming carry
+ define(`func', `mpn_addlsh1_n')
+')
+ifdef(`OPERATION_sublsh1_n',` C macros selecting the subtraction variant
+ define(`ADDSUB', `sub') C limb operation
+ define(`PRED', `gtu') C result > operand signals a borrow out
+ define(`INCR', `-1') C value that applies an incoming borrow
+ define(`LIM', `0') C limb value that ripples an incoming borrow
+ define(`func', `mpn_sublsh1_n')
+')
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') C limbs loaded from up
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21') C limbs loaded from vp
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25') C ADDSUB results stored to rp
+define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29') C not referenced in this file
+define(`x0',`r30') define(`x1',`r31') define(`x2',`r30') define(`x3',`r31') C shifted v limbs, x2/x3 share registers with x0/x1
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func) C rp[] = up[] +- (vp[] << 1) over n limbs, return value in r8
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ zxt4 n = n C I
+ ;;
+')
+ {.mmi; ld8 r11 = [vp], 8 C M01, first vp limb
+ ld8 r10 = [up], 8 C M01, first up limb
+ mov.i r2 = ar.lc C I0, keep caller ar.lc for the exit path
+}{.mmi; and r14 = 3, n C M I, r14 = n mod 4 picks the feed-in path
+ cmp.lt p15, p0 = 4, n C M I, p15 = n > 4
+ add n = -4, n C M I
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}{.bbb
+ (p6) br.dptk .Lb01 C B
+ (p7) br.dptk .Lb10 C B
+ (p8) br.dptk .Lb11 C B
+}
+C n = 0 mod 4 falls through to here. With p15 clear the tail at .Lcj4 finishes.
+.Lb00: ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ add x3 = r11, r11 C M I, first vp limb << 1
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = r10, x3 C M I
+ (p15) br.dpnt .grt4 C B
+ ;;
+ shrp x0 = v0, r11, 63 C I0, (v0 << 1) | (r11 >> 63)
+ cmp.PRED p8, p0 = w3, r10 C M I
+ ;;
+ shrp x1 = v1, v0, 63 C I0
+ ADDSUB w0 = u0, x0 C M I
+ ;;
+ cmp.PRED p6, p0 = w0, u0 C M I
+ ADDSUB w1 = u1, x1 C M I
+ br .Lcj4 C B
+C n = 0 mod 4 and n > 4: load further limbs, set ar.lc, enter the loop at .LL00.
+.grt4: ld8 v3 = [vp], 8 C M01
+ shrp x0 = v0, r11, 63 C I0
+ cmp.PRED p8, p0 = w3, r10 C M I
+ add n = -1, n C M I, adjust the count before it goes to ar.lc
+ ;;
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ shrp x1 = v1, v0, 63 C I0
+ ld8 v0 = [vp], 8 C M01
+ ADDSUB w0 = u0, x0 C M I
+ ;;
+ cmp.PRED p6, p0 = w0, u0 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, x1 C M I
+ br .LL00 C B
+C n = 1 mod 4. With p15 clear the single limb is finished and returned right here.
+.Lb01: add x2 = r11, r11 C M I
+ shr.u n = n, 2 C I0
+ (p15) br.dpnt .grt1 C B
+ ;;
+ ADDSUB w2 = r10, x2 C M I
+ shr.u r8 = r11, 63 C retval I0
+ ;;
+ cmp.PRED p6, p0 = w2, r10 C M I
+ ;;
+ st8 [rp] = w2, 8 C M23
+ (p6) add r8 = 1, r8 C M I
+ br.ret.sptk.many b0 C B
+C n = 1 mod 4 and n > 4: br.cloop selects .grt5 or falls to the .Lcj5 tail.
+.grt1: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C FIXME swap with next I0
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w2 = r10, x2 C M I
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shrp x3 = v3, r11, 63 C I0
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ cmp.PRED p6, p0 = w2, r10 C M I
+ ADDSUB w3 = u3, x3 C M I
+ br.cloop.dptk .grt5 C B
+ ;;
+ shrp x0 = v0, v3, 63 C I0
+ cmp.PRED p8, p0 = w3, u3 C M I
+ br .Lcj5 C B
+C Loop count was nonzero: enter the main loop at .LL01.
+.grt5: shrp x0 = v0, v3, 63 C I0
+ ld8 v3 = [vp], 8 C M01
+ cmp.PRED p8, p0 = w3, u3 C M I
+ br .LL01 C B
+C n = 2 mod 4. With p15 clear the tail at .Lcj2 finishes.
+.Lb10: ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ add x1 = r11, r11 C M I
+ (p15) br.dpnt .grt2 C B
+ ;;
+ ADDSUB w1 = r10, x1 C M I
+ shrp x2 = v2, r11, 63 C I0
+ ;;
+ cmp.PRED p8, p0 = w1, r10 C M I
+ ADDSUB w2 = u2, x2 C M I
+ shr.u r8 = v2, 63 C retval I0
+ ;;
+ cmp.PRED p6, p0 = w2, u2 C M I
+ br .Lcj2 C B
+C n = 2 mod 4 and n > 4: load further limbs, then go to .Loop or to .Lskip.
+.grt2: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = r10, x1 C M I
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ shrp x2 = v2, r11, 63 C I0
+ cmp.PRED p8, p0 = w1, r10 C M I
+ ;;
+ ld8 u1 = [up], 8 C M01
+ shrp x3 = v3, v2, 63 C I0
+ ld8 v2 = [vp], 8 C M01
+ ADDSUB w2 = u2, x2 C M I
+ ;;
+ cmp.PRED p6, p0 = w2, u2 C M I
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = u3, x3 C M I
+ br.cloop.dpnt .Loop C B
+ br .Lskip C B
+C n = 3 mod 4.
+.Lb11: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ add x0 = r11, r11 C M I
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ (p15) br.dpnt .grt3 C B
+ ;;
+C p15 clear: no loop pass is needed, the tail at .Lcj3 finishes.
+ shrp x1 = v1, r11, 63 C I0
+ ADDSUB w0 = r10, x0 C M I
+ ;;
+ cmp.PRED p6, p0 = w0, r10 C M I
+ ADDSUB w1 = u1, x1 C M I
+ ;;
+ shrp x2 = v2, v1, 63 C I0
+ cmp.PRED p8, p0 = w1, u1 C M I
+ br .Lcj3 C B
+C n = 3 mod 4 and n > 4: set ar.lc and enter the loop at .LL11.
+.grt3: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ shrp x1 = v1, r11, 63 C I0
+ ADDSUB w0 = r10, x0 C M I
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ cmp.PRED p6, p0 = w0, r10 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, x1 C M I
+ ;;
+ shrp x2 = v2, v1, 63 C I0
+ ld8 v1 = [vp], 8 C M01
+ cmp.PRED p8, p0 = w1, u1 C M I
+ br .LL11 C B
+C Each loop pass stores four limbs. cmp.PRED flags a carry/borrow out of ADDSUB,
+C and the predicated cmp.eq.or/add pair applies an incoming one and ripples it on LIM.
+C *** MAIN LOOP START ***
+ ALIGN(32)
+.Loop: st8 [rp] = w1, 8 C M23
+ shrp x0 = v0, v3, 63 C I0
+ (p8) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p8) add w2 = INCR, w2 C M I
+ ld8 v3 = [vp], 8 C M01
+ cmp.PRED p8, p0 = w3, u3 C M I
+ ;;
+.LL01: ld8 u3 = [up], 8 C M01
+ shrp x1 = v1, v0, 63 C I0
+ (p6) cmp.eq.or p8, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ ld8 v0 = [vp], 8 C M01
+ ADDSUB w0 = u0, x0 C M I
+ ;;
+ st8 [rp] = w2, 8 C M23
+ cmp.PRED p6, p0 = w0, u0 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, x1 C M I
+ ;;
+.LL00: st8 [rp] = w3, 8 C M23
+ shrp x2 = v2, v1, 63 C I0
+ (p8) cmp.eq.or p6, p0 = LIM, w0 C M I
+ (p8) add w0 = INCR, w0 C M I
+ ld8 v1 = [vp], 8 C M01
+ cmp.PRED p8, p0 = w1, u1 C M I
+ ;;
+.LL11: ld8 u1 = [up], 8 C M01
+ shrp x3 = v3, v2, 63 C I0
+ (p6) cmp.eq.or p8, p0 = LIM, w1 C M I
+ (p6) add w1 = INCR, w1 C M I
+ ld8 v2 = [vp], 8 C M01
+ ADDSUB w2 = u2, x2 C M I
+ ;;
+ st8 [rp] = w0, 8 C M23
+ cmp.PRED p6, p0 = w2, u2 C M I
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = u3, x3 C M I
+ br.cloop.dptk .Loop C B
+ ;;
+C *** MAIN LOOP END ***
+C Wind-down without further loads. Labels .Lcj5 to .Lcj2 are also entered from the feed-in code.
+.Lskip: st8 [rp] = w1, 8 C M23
+ shrp x0 = v0, v3, 63 C I0
+ (p8) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p8) add w2 = INCR, w2 C M I
+ cmp.PRED p8, p0 = w3, u3 C M I
+ ;;
+.Lcj5: shrp x1 = v1, v0, 63 C I0
+ (p6) cmp.eq.or p8, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, x0 C M I
+ ;;
+ st8 [rp] = w2, 8 C M23
+ cmp.PRED p6, p0 = w0, u0 C M I
+ ADDSUB w1 = u1, x1 C M I
+ ;;
+.Lcj4: st8 [rp] = w3, 8 C M23
+ shrp x2 = v2, v1, 63 C I0
+ (p8) cmp.eq.or p6, p0 = LIM, w0 C M I
+ (p8) add w0 = INCR, w0 C M I
+ cmp.PRED p8, p0 = w1, u1 C M I
+ ;;
+.Lcj3: shr.u r8 = v2, 63 C I0, retval starts as the bit shifted out of the last vp limb
+ (p6) cmp.eq.or p8, p0 = LIM, w1 C M I
+ (p6) add w1 = INCR, w1 C M I
+ ADDSUB w2 = u2, x2 C M I
+ ;;
+ st8 [rp] = w0, 8 C M23
+ cmp.PRED p6, p0 = w2, u2 C M I
+ ;;
+.Lcj2: st8 [rp] = w1, 8 C M23
+ (p8) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p8) add w2 = INCR, w2 C M I
+ ;;
+.Lcj1: st8 [rp] = w2, 8 C M23
+ mov.i ar.lc = r2 C I0, restore the ar.lc value saved at entry
+ (p6) add r8 = 1, r8 C M I, add the carry/borrow out of the last limb
+ br.ret.sptk.many b0 C B
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/ia64/aorsorrlsh1_n.asm b/gmp/mpn/ia64/aorsorrlsh1_n.asm
deleted file mode 100644
index 9b58b9e11f..0000000000
--- a/gmp/mpn/ia64/aorsorrlsh1_n.asm
+++ /dev/null
@@ -1,48 +0,0 @@
-dnl IA-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: 3.0
-C Itanium 2: 1.5
-
-
-define(LSH, 1)
-
-ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
-
-include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/ia64/aorsorrlsh2_n.asm b/gmp/mpn/ia64/aorsorrlsh2_n.asm
deleted file mode 100644
index 39b384a91b..0000000000
--- a/gmp/mpn/ia64/aorsorrlsh2_n.asm
+++ /dev/null
@@ -1,48 +0,0 @@
-dnl IA-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: 3.0
-C Itanium 2: 1.5
-
-
-define(LSH, 2)
-
-ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
-
-include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/ia64/aorsorrlshC_n.asm b/gmp/mpn/ia64/aorsorrlshC_n.asm
deleted file mode 100644
index d327838402..0000000000
--- a/gmp/mpn/ia64/aorsorrlshC_n.asm
+++ /dev/null
@@ -1,397 +0,0 @@
-dnl IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-C cycles/limb
-C Itanium: ?
-C Itanium 2: 1.5
-
-C TODO
-C * Use shladd in feed-in code (for mpn_addlshC_n).
-C * Rewrite loop to schedule loads closer to use, since we do prefetch.
-
-C INPUT PARAMETERS
-define(`rp', `r32')
-define(`up', `r33')
-define(`vp', `r34')
-define(`n', `r35')
-
-ifdef(`DO_add', `
- define(`ADDSUB', `add $1 = $2, $3')
- define(`CMP', `cmp.ltu $1,p0 = $2, $3')
- define(`INCR', 1)
- define(`LIM', -1)
- define(`func', mpn_addlsh`'LSH`'_n)')
-ifdef(`DO_sub', `
- define(`ADDSUB', `sub $1 = $2, $3')
- define(`CMP', `cmp.gtu $1,p0 = $2, $3')
- define(`INCR', -1)
- define(`LIM', 0)
- define(`func', mpn_sublsh`'LSH`'_n)')
-ifdef(`DO_rsb', `
- define(`ADDSUB', `sub $1 = $3, $2')
- define(`CMP', `cmp.gtu $1,p0 = $2, $4')
- define(`INCR', -1)
- define(`LIM', 0)
- define(`func', mpn_rsblsh`'LSH`'_n)')
-
-define(cmpeqor, `cmp.eq.or')
-define(PFDIST, 500)
-
-define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
-define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
-define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
-define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
-define(`x0',`r30') define(`x1',`r31') define(`x2',`r3') define(`x3',`r9')
-
-C r3 r8 r9 r10 r11
-
-ASM_START()
-PROLOGUE(func)
- .prologue
- .save ar.lc, r2
- .body
-ifdef(`HAVE_ABI_32',`
- addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- nop.i 0
- addp4 vp = 0, vp C M I
- nop.m 0
- zxt4 n = n C I
- ;;
-')
- {.mmi; ld8 r11 = [vp], 8 C M01
- ld8 r10 = [up], 8 C M01
- mov.i r2 = ar.lc C I0
-}{.mmi; and r14 = 3, n C M I
- cmp.lt p15, p0 = 4, n C M I
- add n = -5, n C M I
- ;;
-}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
- cmp.eq p7, p0 = 2, r14 C M I
- cmp.eq p8, p0 = 3, r14 C M I
-}{.bbb
- (p6) br.dptk .Lb01 C B
- (p7) br.dptk .Lb10 C B
- (p8) br.dptk .Lb11 C B
-}
-
-.Lb00: ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- shr.u n = n, 2 C I0
- ;;
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- shl x3 = r11, LSH C I0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- shrp x0 = v0, r11, 64-LSH C I0
-.mmb; ADDSUB( w3, r10, x3) C M I
- nop 0
- (p15) br.dpnt .grt4 C B
- ;;
-.mii; CMP( p7, w3, r10, x3) C M II0
- shrp x1 = v1, v0, 64-LSH C I0
- ADDSUB( w0, u0, x0) C M I
- ;;
-.mii; CMP( p8, w0, u0, x0) C M I
- shrp x2 = v2, v1, 64-LSH C I0
- ADDSUB( w1, u1, x1) C M I
-.mmb; nop 0
- nop 0
- br .Lcj4 C B
-
-ALIGN(32)
-.grt4: ld8 v3 = [vp], 8 C M01
- shrp x0 = v0, r11, 64-LSH C I0
- CMP( p8, w3, r10, x3) C M I
- ;;
-.mmi; ld8 u3 = [up], 8 C M01
- add r11 = PFDIST, vp
- shrp x1 = v1, v0, 64-LSH C I0
-.mmi; ld8 v0 = [vp], 8 C M01
- ADDSUB( w0, u0, x0) C M I
- nop 0
- ;;
-.mmi; CMP( p6, w0, u0, x0) C M I
- add r10 = PFDIST, up
- mov.i ar.lc = n C I0
-.mmb; ADDSUB( w1, u1, x1) C M I
- ld8 u0 = [up], 8 C M01
- br .LL00 C B
-
-
- ALIGN(32)
-.Lb01:
-ifdef(`DO_add',
-` shladd w2 = r11, LSH, r10 C M I
- shr.u r8 = r11, 64-LSH C retval I0
- (p15) br.dpnt .grt1 C B
- ;;
-',`
- shl x2 = r11, LSH C I0
- (p15) br.dpnt .grt1 C B
- ;;
- ADDSUB( w2, r10, x2) C M I
- shr.u r8 = r11, 64-LSH C retval I0
- ;;
-')
- CMP( p6, w2, r10, x2) C M I
- br .Lcj1
-
-.grt1: ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- shr.u n = n, 2 C I0
- ;;
- ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- mov.i ar.lc = n C FIXME swap with next I0
-ifdef(`DO_add',
-`',`
- ADDSUB( w2, r10, x2)
-')
- ;;
-.mmi; ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- shrp x3 = v3, r11, 64-LSH C I0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- shrp x0 = v0, v3, 64-LSH C I0
-.mmb; CMP( p6, w2, r10, x2) C M I
- ADDSUB( w3, u3, x3) C M I
- br.cloop.dptk .grt5 C B
- ;;
-.mmi; CMP( p7, w3, u3, x3) C M I
- ADDSUB( w0, u0, x0) C M I
- shrp x1 = v1, v0, 64-LSH C I0
-.mmb; nop 0
- nop 0
- br .Lcj5 C B
-.grt5:
-.mmi; add r10 = PFDIST, up
- add r11 = PFDIST, vp
- shrp x0 = v0, v3, 64-LSH C I0
-.mmb; ld8 v3 = [vp], 8 C M01
- CMP( p8, w3, u3, x3) C M I
- br .LL01 C B
-
- ALIGN(32)
-.Lb10: ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- shl x1 = r11, LSH C I0
-.mmb; nop 0
- nop 0
- (p15) br.dpnt .grt2 C B
- ;;
-.mmi; ADDSUB( w1, r10, x1) C M I
- nop 0
- shrp x2 = v2, r11, 64-LSH C I0
- ;;
-.mmi; CMP( p9, w1, r10, x1) C M I
- ADDSUB( w2, u2, x2) C M I
- shr.u r8 = v2, 64-LSH C retval I0
- ;;
-.mmb; CMP( p6, w2, u2, x2) C M I
- nop 0
- br .Lcj2 C B
-
-.grt2: ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- shr.u n = n, 2 C I0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- ld8 u0 = [up], 8 C M01
- mov.i ar.lc = n C I0
-.mmi; ADDSUB( w1, r10, x1) C M I
- nop 0
- nop 0
- ;;
-.mii; ld8 v1 = [vp], 8 C M01
- shrp x2 = v2, r11, 64-LSH C I0
- CMP( p8, w1, r10, x1) C M I
- ;;
-.mmi; add r10 = PFDIST, up
- ld8 u1 = [up], 8 C M01
- shrp x3 = v3, v2, 64-LSH C I0
-.mmi; add r11 = PFDIST, vp
- ld8 v2 = [vp], 8 C M01
- ADDSUB( w2, u2, x2) C M I
- ;;
-.mmi; CMP( p6, w2, u2, x2) C M I
- ld8 u2 = [up], 8 C M01
- shrp x0 = v0, v3, 64-LSH C I0
-.mbb; ADDSUB( w3, u3, x3) C M I
- br.cloop.dpnt L(top) C B
- br L(end) C B
-
-.Lb11: ld8 v1 = [vp], 8 C M01
- ld8 u1 = [up], 8 C M01
- shl x0 = r11, LSH C I0
- ;;
-.mmi; ld8 v2 = [vp], 8 C M01
- ld8 u2 = [up], 8 C M01
- shr.u n = n, 2 C I0
-.mmb; nop 0
- nop 0
- (p15) br.dpnt .grt3 C B
- ;;
-.mii; nop 0
- shrp x1 = v1, r11, 64-LSH C I0
- ADDSUB( w0, r10, x0) C M I
- ;;
-.mii; CMP( p8, w0, r10, x0) C M I
- shrp x2 = v2, v1, 64-LSH C I0
- ADDSUB( w1, u1, x1) C M I
- ;;
-.mmb; CMP( p9, w1, u1, x1) C M I
- ADDSUB( w2, u2, x2) C M I
- br .Lcj3 C B
-.grt3:
-.mmi; ld8 v3 = [vp], 8 C M01
- ld8 u3 = [up], 8 C M01
- shrp x1 = v1, r11, 64-LSH C I0
-.mmi; ADDSUB( w0, r10, x0) C M I
- nop 0
- nop 0
- ;;
-.mmi; ld8 v0 = [vp], 8 C M01
- CMP( p6, w0, r10, x0) C M I
- mov.i ar.lc = n C I0
-.mmi; ld8 u0 = [up], 8 C M01
- ADDSUB( w1, u1, x1) C M I
- nop 0
- ;;
-.mmi; add r10 = PFDIST, up
- add r11 = PFDIST, vp
- shrp x2 = v2, v1, 64-LSH C I0
-.mmb; ld8 v1 = [vp], 8 C M01
- CMP( p8, w1, u1, x1) C M I
- br .LL11 C B
-
-
-C *** MAIN LOOP START ***
- ALIGN(32)
-L(top): st8 [rp] = w1, 8 C M23
- lfetch [r10], 32
- (p8) cmpeqor p6, p0 = LIM, w2 C M I
- (p8) add w2 = INCR, w2 C M I
- ld8 v3 = [vp], 8 C M01
- CMP( p8, w3, u3, x3) C M I
- ;;
-.LL01: ld8 u3 = [up], 8 C M01
- shrp x1 = v1, v0, 64-LSH C I0
- (p6) cmpeqor p8, p0 = LIM, w3 C M I
- (p6) add w3 = INCR, w3 C M I
- ld8 v0 = [vp], 8 C M01
- ADDSUB( w0, u0, x0) C M I
- ;;
- st8 [rp] = w2, 8 C M23
- CMP( p6, w0, u0, x0) C M I
- nop.b 0
- ld8 u0 = [up], 8 C M01
- lfetch [r11], 32
- ADDSUB( w1, u1, x1) C M I
- ;;
-.LL00: st8 [rp] = w3, 8 C M23
- shrp x2 = v2, v1, 64-LSH C I0
- (p8) cmpeqor p6, p0 = LIM, w0 C M I
- (p8) add w0 = INCR, w0 C M I
- ld8 v1 = [vp], 8 C M01
- CMP( p8, w1, u1, x1) C M I
- ;;
-.LL11: ld8 u1 = [up], 8 C M01
- shrp x3 = v3, v2, 64-LSH C I0
- (p6) cmpeqor p8, p0 = LIM, w1 C M I
- (p6) add w1 = INCR, w1 C M I
- ld8 v2 = [vp], 8 C M01
- ADDSUB( w2, u2, x2) C M I
- ;;
-.mmi; st8 [rp] = w0, 8 C M23
- CMP( p6, w2, u2, x2) C M I
- shrp x0 = v0, v3, 64-LSH C I0
- ld8 u2 = [up], 8 C M01
- ADDSUB( w3, u3, x3) C M I
- br.cloop.dptk L(top) C B
- ;;
-C *** MAIN LOOP END ***
-
-L(end):
-.mmi; st8 [rp] = w1, 8 C M23
- (p8) cmpeqor p6, p0 = LIM, w2 C M I
- shrp x1 = v1, v0, 64-LSH C I0
-.mmi;
- (p8) add w2 = INCR, w2 C M I
- CMP( p7, w3, u3, x3) C M I
- ADDSUB( w0, u0, x0) C M I
- ;;
-.Lcj5:
-.mmi; st8 [rp] = w2, 8 C M23
- (p6) cmpeqor p7, p0 = LIM, w3 C M I
- shrp x2 = v2, v1, 64-LSH C I0
-.mmi;
- (p6) add w3 = INCR, w3 C M I
- CMP( p8, w0, u0, x0) C M I
- ADDSUB( w1, u1, x1) C M I
- ;;
-.Lcj4:
-.mmi; st8 [rp] = w3, 8 C M23
- (p7) cmpeqor p8, p0 = LIM, w0 C M I
- mov.i ar.lc = r2 C I0
-.mmi;
- (p7) add w0 = INCR, w0 C M I
- CMP( p9, w1, u1, x1) C M I
- ADDSUB( w2, u2, x2) C M I
- ;;
-.Lcj3:
-.mmi; st8 [rp] = w0, 8 C M23
- (p8) cmpeqor p9, p0 = LIM, w1 C M I
- shr.u r8 = v2, 64-LSH C I0
-.mmi;
- (p8) add w1 = INCR, w1 C M I
- CMP( p6, w2, u2, x2) C M I
- nop 0
- ;;
-.Lcj2:
-.mmi; st8 [rp] = w1, 8 C M23
- (p9) cmpeqor p6, p0 = LIM, w2 C M I
- (p9) add w2 = INCR, w2 C M I
- ;;
-.Lcj1:
-.mmb; st8 [rp] = w2 C M23
-ifdef(`DO_rsb',`
- (p6) add r8 = -1, r8 C M I
-',`
- (p6) add r8 = 1, r8 C M I
-') br.ret.sptk.many b0 C B
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/ia64/bdiv_dbm1c.asm b/gmp/mpn/ia64/bdiv_dbm1c.asm
index 47e4553cda..6ff4fdaaf9 100644
--- a/gmp/mpn/ia64/bdiv_dbm1c.asm
+++ b/gmp/mpn/ia64/bdiv_dbm1c.asm
@@ -1,34 +1,21 @@
dnl IA-64 mpn_bdiv_dbm1.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
dnl Copyright 2008, 2009 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/ia64/cnd_aors_n.asm b/gmp/mpn/ia64/cnd_aors_n.asm
deleted file mode 100644
index dc4a937403..0000000000
--- a/gmp/mpn/ia64/cnd_aors_n.asm
+++ /dev/null
@@ -1,259 +0,0 @@
-dnl IA-64 mpn_cnd_add_n/mpn_cnd_sub_n.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: ?
-C Itanium 2: 1.5
-
-C INPUT PARAMETERS
-define(`cnd', `r32')
-define(`rp', `r33')
-define(`up', `r34')
-define(`vp', `r35')
-define(`n', `r36')
-
-ifdef(`OPERATION_cnd_add_n',`
- define(ADDSUB, add)
- define(CND, ltu)
- define(INCR, 1)
- define(LIM, -1)
- define(func, mpn_cnd_add_n)
-')
-ifdef(`OPERATION_cnd_sub_n',`
- define(ADDSUB, sub)
- define(CND, gtu)
- define(INCR, -1)
- define(LIM, 0)
- define(func, mpn_cnd_sub_n)
-')
-
-define(cmpeqor, `cmp.eq.or')
-define(PFDIST, 160)
-
-C Some useful aliases for registers we use
-define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
-define(`x0',`r20') define(`x1',`r21') define(`x2',`r22') define(`x3',`r23')
-define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
-define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
-define(`up1',`up') define(`up2',`r8') define(`upadv',`r1')
-define(`vp1',`vp') define(`vp2',`r9') define(`vpadv',`r11')
-define(`rp1',`rp') define(`rp2',`r10')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ASM_START()
-PROLOGUE(func)
- .prologue
- .save ar.lc, r2
- .body
-ifdef(`HAVE_ABI_32',`
- addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- nop.i 0
- addp4 vp = 0, vp C M I
- nop.m 0
- zxt4 n = n C I
- ;;
-')
-.mmi; and r3 = 3, n C M I
- add n = -1, n C M I
- mov r2 = ar.lc C I0
-.mmi; cmp.ne p6, p7 = 0, cnd C M I
- add vp2 = 8, vp C M I
- add up2 = 8, up C M I
- ;;
-.mmi; add upadv = PFDIST, up C M I
- add vpadv = PFDIST, vp C M I
- shr.u n = n, 2 C I0
- .pred.rel "mutex", p6, p7
-.mmi; add rp2 = 8, rp C M I
- (p6) mov cnd = -1 C M I
- (p7) mov cnd = 0 C M I
- ;;
- cmp.eq p9, p0 = 1, r3 C M I
- cmp.eq p7, p0 = 2, r3 C M I
- cmp.eq p8, p0 = 3, r3 C M I
- (p9) br L(b1) C B
- (p7) br L(b2) C B
- (p8) br L(b3) C B
- ;;
-L(b0):
-.mmi; ld8 v2 = [vp1], 16 C M01
- ld8 v3 = [vp2], 16 C M01
- mov ar.lc = n C I0
- ;;
- ld8 u2 = [up1], 16 C M01
- ld8 u3 = [up2], 16 C M01
- and x2 = v2, cnd C M I
- and x3 = v3, cnd C M I
- ;;
- ADDSUB w2 = u2, x2 C M I
- ADDSUB w3 = u3, x3 C M I
- ;;
- ld8 v0 = [vp1], 16 C M01
- ld8 v1 = [vp2], 16 C M01
- cmp.CND p8, p0 = w2, u2 C M I
- cmp.CND p9, p0 = w3, u3 C M I
- br L(lo0)
-
-L(b1): ld8 v1 = [vp1], 8 C M01
- add vp2 = 8, vp2 C M I
- add rp2 = 8, rp2 C M I
- ;;
- ld8 u1 = [up1], 8 C M01
- add up2 = 8, up2 C M I
- and x1 = v1, cnd C M I
- ;;
- ADDSUB w1 = u1, x1 C M I
- cmp.ne p10, p0 = 0, n
- add n = -1, n
- ;;
- cmp.CND p7, p0 = w1, u1 C M I
- st8 [rp1] = w1, 8 C M23
- (p10) br L(b0)
- ;;
- mov r8 = 0 C M I
- br L(e1)
-
-L(b3): ld8 v3 = [vp1], 8 C M01
- add vp2 = 8, vp2 C M I
- add rp2 = 8, rp2 C M I
- ;;
- ld8 u3 = [up1], 8 C M01
- add up2 = 8, up2 C M I
- and x3 = v3, cnd C M I
- ;;
- ADDSUB w3 = u3, x3 C M I
- ;;
- cmp.CND p9, p0 = w3, u3 C M I
- st8 [rp1] = w3, 8 C M23
- C fall through
-
-L(b2):
-.mmi; ld8 v0 = [vp1], 16 C M01
- ld8 v1 = [vp2], 16 C M01
- mov ar.lc = n C I0
- ;;
- ld8 u0 = [up1], 16 C M01
- ld8 u1 = [up2], 16 C M01
- and x0 = v0, cnd C M I
- and x1 = v1, cnd C M I
- ;;
- ADDSUB w0 = u0, x0 C M I
- ADDSUB w1 = u1, x1 C M I
- br.cloop.dptk L(gt2) C B
- ;;
- cmp.CND p6, p0 = w0, u0 C M I
- br L(e2) C B
-L(gt2):
- ld8 v2 = [vp1], 16 C M01
- ld8 v3 = [vp2], 16 C M01
- cmp.CND p6, p0 = w0, u0 C M I
- cmp.CND p7, p0 = w1, u1 C M I
- br L(lo2) C B
-
-
-C *** MAIN LOOP START ***
-C ALIGN(32)
-L(top):
-.mmi; ld8 v2 = [vp1], 16 C M01
- ld8 v3 = [vp2], 16 C M01
- cmp.CND p6, p0 = w0, u0 C M I
-.mmi; st8 [rp1] = w2, 16 C M23
- st8 [rp2] = w3, 16 C M23
- cmp.CND p7, p0 = w1, u1 C M I
- ;;
-L(lo2):
-.mmi; ld8 u2 = [up1], 16 C M01
- ld8 u3 = [up2], 16 C M01
- (p9) cmpeqor p6, p0 = LIM, w0 C M I
-.mmi; and x2 = v2, cnd C M I
- and x3 = v3, cnd C M I
- (p9) add w0 = INCR, w0 C M I
- ;;
-.mmi; ADDSUB w2 = u2, x2 C M I
- (p6) cmpeqor p7, p0 = LIM, w1 C M I
- (p6) add w1 = INCR, w1 C M I
-.mmi; ADDSUB w3 = u3, x3 C M I
- lfetch [upadv], 32
- nop 0
- ;;
-.mmi; ld8 v0 = [vp1], 16 C M01
- ld8 v1 = [vp2], 16 C M01
- cmp.CND p8, p0 = w2, u2 C M I
-.mmi; st8 [rp1] = w0, 16 C M23
- st8 [rp2] = w1, 16 C M23
- cmp.CND p9, p0 = w3, u3 C M I
- ;;
-L(lo0):
-.mmi; ld8 u0 = [up1], 16 C M01
- ld8 u1 = [up2], 16 C M01
- (p7) cmpeqor p8, p0 = LIM, w2 C M I
-.mmi; and x0 = v0, cnd C M I
- and x1 = v1, cnd C M I
- (p7) add w2 = INCR, w2 C M I
- ;;
-.mmi; ADDSUB w0 = u0, x0 C M I
- (p8) cmpeqor p9, p0 = LIM, w3 C M I
- (p8) add w3 = INCR, w3 C M I
-.mmb; ADDSUB w1 = u1, x1 C M I
- lfetch [vpadv], 32
- br.cloop.dptk L(top) C B
- ;;
-C *** MAIN LOOP END ***
-
-
-L(end):
-.mmi; st8 [rp1] = w2, 16 C M23
- st8 [rp2] = w3, 16 C M23
- cmp.CND p6, p0 = w0, u0 C M I
- ;;
-L(e2):
-.mmi; cmp.CND p7, p0 = w1, u1 C M I
- (p9) cmpeqor p6, p0 = LIM, w0 C M I
- (p9) add w0 = INCR, w0 C M I
- ;;
-.mmi; mov r8 = 0 C M I
- (p6) cmpeqor p7, p0 = LIM, w1 C M I
- (p6) add w1 = INCR, w1 C M I
- ;;
-.mmi; st8 [rp1] = w0, 16 C M23
- st8 [rp2] = w1, 16 C M23
- mov ar.lc = r2 C I0
-L(e1):
-.mmb; nop 0
- (p7) mov r8 = 1 C M I
- br.ret.sptk.many b0 C B
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/ia64/copyd.asm b/gmp/mpn/ia64/copyd.asm
index b94a1af362..759629e4a7 100644
--- a/gmp/mpn/ia64/copyd.asm
+++ b/gmp/mpn/ia64/copyd.asm
@@ -1,34 +1,21 @@
dnl IA-64 mpn_copyd -- copy limb vector, decrementing.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/ia64/copyi.asm b/gmp/mpn/ia64/copyi.asm
index 49ed192021..11451dc08d 100644
--- a/gmp/mpn/ia64/copyi.asm
+++ b/gmp/mpn/ia64/copyi.asm
@@ -1,34 +1,21 @@
dnl IA-64 mpn_copyi -- copy limb vector, incrementing.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/ia64/dive_1.asm b/gmp/mpn/ia64/dive_1.asm
index 5e4a273530..9b9d085c0c 100644
--- a/gmp/mpn/ia64/dive_1.asm
+++ b/gmp/mpn/ia64/dive_1.asm
@@ -1,34 +1,21 @@
dnl IA-64 mpn_divexact_1 -- mpn by limb exact division.
-dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
-
-dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -49,7 +36,7 @@ C This code is a bit messy, and not as similar to mode1o.asm as desired.
C The critical path during initialization is for computing the inverse of the
C divisor. Since odd divisors are probably common, we conditionally execute
-C the initial count_trailing_zeros code and the downshift.
+C the initial count_trailing_zeros code and the downshift.
C Possible improvement: Merge more of the feed-in code into the inverse
C computation.
@@ -190,28 +177,22 @@ ifdef(`HAVE_ABI_32',
ld8 r21 = [up], 8
br .Lent
-.Ltop: ld8 r21 = [up], 8
+.Loop: ld8 r21 = [up], 8
xma.l f12 = f9, f8, f10 C q = c * -inverse + si
- nop.b 0
;;
.Lent: add r16 = 160, up
shl r22 = r21, lshift
- nop.b 0
;;
stf8 [rp] = f12, 8
xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
- nop.b 0
- nop.m 0
xmpy.l f10 = f11, f7 C si = ulimb * inverse
- nop.b 0
;;
or r31 = r22, r23
shr.u r23 = r21, rshift
- nop.b 0
;;
lfetch [r16]
setf.sig f11 = r31
- br.cloop.sptk.few.clr .Ltop
+ br.cloop.sptk.few.clr .Loop
xma.l f12 = f9, f8, f10 C q = c * -inverse + si
diff --git a/gmp/mpn/ia64/divrem_1.asm b/gmp/mpn/ia64/divrem_1.asm
index e8878209db..aa50ac902b 100644
--- a/gmp/mpn/ia64/divrem_1.asm
+++ b/gmp/mpn/ia64/divrem_1.asm
@@ -1,35 +1,22 @@
dnl IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
dnl unnormalized limb.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
dnl Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/ia64/divrem_2.asm b/gmp/mpn/ia64/divrem_2.asm
index 9864311278..da3e9d64b7 100644
--- a/gmp/mpn/ia64/divrem_2.asm
+++ b/gmp/mpn/ia64/divrem_2.asm
@@ -1,45 +1,59 @@
-dnl IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+dnl IA-64 mpn_divrem_2 -- Divide an n-limb number by a 2-limb number.
-dnl Copyright 2010, 2013 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C norm frac
-C itanium 1
-C itanium 2 29 29
+C cycles/limb
+C Itanium: 63
+C Itanium 2: 46
C TODO
-C * Inline and interleave limb inversion code with loop setup code.
-C * We should use explicit bundling in much of the code, since it typically
-C cuts some cycles with the GNU assembler.
+C * Further optimize the loop. We could probably do some more trickery with
+C arithmetic in the FPU, or perhaps use a non-zero addend of xma in more
+C places.
+C * Software pipeline for perhaps 5 saved cycles, around the end and start of
+C the loop.
+C * Schedule code outside of loop better.
+C * Update the comments. They are now using the same name for the same
+C logical quantity.
+C * Handle conditional zeroing of r31 in loop more cleanly.
+C * Inline mpn_invert_limb and schedule its insns across the entire init code.
+C * Ultimately, use 2-limb, or perhaps 3-limb or 4-limb inverse.
+
+define(`qp',`r32')
+define(`qxn',`r33')
+define(`np',`r34')
+define(`nn',`r35')
+define(`dp',`r36')
+define(`fnh',`f11')
+define(`fminus1',`f10')
+define(`fd0',`f13')
+define(`fd1',`f14')
+define(`d0',`r39')
+define(`d1',`r36')
+define(`fnl',`f32')
+define(`fdinv',`f12')
+
+define(`R1',`r38') define(`R0',`r37')
+define(`P1',`r28') define(`P0',`r27')
ASM_START()
@@ -47,234 +61,204 @@ C HP's assembler requires these declarations for importing mpn_invert_limb
.global mpn_invert_limb
.type mpn_invert_limb,@function
-C INPUT PARAMETERS
-C qp = r32
-C fn = r33
-C np = r34
-C nn = r35
-C dp = r36
-
-define(`f0x1', `f15')
-
-ASM_START()
PROLOGUE(mpn_divrem_2)
.prologue
-ifdef(`HAVE_ABI_32',
-` addp4 r32 = 0, r32 C M I
- addp4 r34 = 0, r34 C M I
- zxt4 r35 = r35 C I
- addp4 r36 = 0, r36 C M I
- nop.m 0
- zxt4 r33 = r33 C I
- ;;
-')
.save ar.pfs, r42
- alloc r42 = ar.pfs, 5, 9, 1, 0
- shladd r34 = r35, 3, r34
- adds r14 = 8, r36
- mov r43 = r1
- ;;
- adds r15 = -8, r34
- ld8 r39 = [r14]
- .save ar.lc, r45
- mov r45 = ar.lc
- adds r14 = -16, r34
- mov r40 = r0
- adds r34 = -24, r34
- ;;
- ld8 r38 = [r15]
+ .save ar.lc, r44
.save rp, r41
- mov r41 = b0
- .body
- ld8 r36 = [r36]
- ld8 r37 = [r14]
- ;;
- cmp.gtu p6, p7 = r39, r38
- (p6) br.cond.dptk .L8
- ;;
- cmp.leu p8, p9 = r36, r37
- cmp.geu p6, p7 = r39, r38
- ;;
- (p8) cmp4.ne.and.orcm p6, p7 = 0, r0
- (p7) br.cond.dptk .L51
-.L8:
- add r14 = r33, r35 // un + fn
- mov r46 = r39 // argument to mpn_invert_limb
- ;;
- adds r35 = -3, r14
- ;;
- cmp.gt p12, p0 = r0, r35
- (p12) br.cond.dpnt L(end)
- br.call.sptk.many b0 = mpn_invert_limb
- ;;
- setf.sig f11 = r8 // di (non-final)
- setf.sig f34 = r39 // d1
- setf.sig f33 = r36 // d0
- mov r1 = r43
- ;;
- mov r17 = 1
- setf.sig f9 = r38 // n2
- xma.l f6 = f11, f34, f0 // t0 = LO(di * d1)
- ;;
- setf.sig f10 = r37 // n1
- setf.sig f15 = r17 // 1
- xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0)
- ;;
- getf.sig r17 = f6
- getf.sig r16 = f8
- mov ar.lc = r35
- ;;
- sub r18 = r0, r39 // -d1
- add r14 = r17, r36
- ;;
- setf.sig f14 = r18 // -d1
- cmp.leu p8, p9 = r17, r14
- add r16 = r14, r16
- ;;
- (p9) adds r19 = 0, r0
- (p8) adds r19 = -1, r0
- cmp.gtu p6, p7 = r14, r16
- ;;
- (p6) adds r19 = 1, r19
- ;;
-ifelse(1,1,`
- cmp.gt p7, p6 = r0, r19
- ;;
- (p6) adds r8 = -1, r8 // di--
- (p6) sub r14 = r16, r39 // t0 -= d1
- (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
- ;;
- (p6) cmp.gt p9, p8 = 1, r19
- (p7) cmp.gt p9, p8 = 0, r19
- (p6) adds r19 = -1, r19 // t1 -= cy
- mov r16 = r14
- ;;
- (p8) adds r8 = -1, r8 // di--
- (p8) sub r14 = r16, r39 // t0 -= d1
- (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
- ;;
- (p8) cmp.gt p7, p6 = 1, r19
- (p9) cmp.gt p7, p6 = 0, r19
- (p8) adds r19 = -1, r19 // t1 -= cy
- mov r16 = r14
- ;;
- (p6) adds r8 = -1, r8 // di--
- (p6) sub r14 = r16, r39 // t0 -= d1
- (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
- ;;
- (p6) cmp.gt p9, p8 = 1, r19
- (p7) cmp.gt p9, p8 = 0, r19
- (p6) adds r19 = -1, r19 // t1 -= cy
- mov r16 = r14
- ;;
- (p8) adds r8 = -1, r8 // di--
- (p8) sub r14 = r16, r39 // t0 -= d1
- (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
- ;;
- (p8) adds r19 = -1, r19 // t1 -= cy
- mov r16 = r14
-',`
- cmp.gt p8, p9 = r0, r19
- (p8) br.cond.dpnt .L46
-.L52:
- cmp.leu p6, p7 = r39, r16
- sub r14 = r16, r39
- adds r8 = -1, r8
- ;;
- (p7) adds r19 = -1, r19
- mov r16 = r14
+ifdef(`HAVE_ABI_32',
+` addp4 qp = 0, qp C M I
+ addp4 np = 0, np C M I
+ addp4 dp = 0, dp C M I
+ zxt4 nn = nn C I
+ zxt4 qxn = qxn C I
;;
- (p7) cmp.gt p8, p9 = r0, r19
- (p9) br.cond.dptk .L52
-.L46:
')
- setf.sig f32 = r8 // di
- shladd r32 = r35, 3, r32
- ;;
- ALIGN(16)
-L(top): nop 0
- nop 0
- cmp.gt p8, p9 = r33, r35
- ;;
- (p8) mov r37 = r0
- (p9) ld8 r37 = [r34], -8
- xma.hu f8 = f9, f32, f10 // 0,29
- xma.l f12 = f9, f32, f10 // 0
- ;;
- getf.sig r20 = f12 // q0 4
- xma.l f13 = f15, f8, f9 // q += n2 4
- sub r8 = -1, r36 // bitnot d0
- ;;
- getf.sig r18 = f13 // 8
- xma.l f7 = f14, f13, f10 // 8
- xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8
- xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9
- ;;
- getf.sig r38 = f7 // n1 12
- getf.sig r16 = f6 // 13
- getf.sig r19 = f9 // 14
- ;;
- sub r38 = r38, r39 // n1 -= d1 17
- ;;
- cmp.ne p9, p0 = r0, r0 // clear p9
- cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18
- ;;
- sub r37 = r37, r16 // n0 -= t0 19
- (p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19
- (p10) sub r38 = r38, r19 // n1 -= t1 19
- ;;
- cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20
- ;;
- (p7) cmp.ltu p9, p0 = r8, r37 // 21
- (p6) add r18 = 1, r18 //
- (p7) add r37 = r37, r36 // 21
- (p7) add r38 = r38, r39 // 21
+ alloc r42 = ar.pfs, 5,8,1,0 C M2
+ ld8 d0 = [dp], 8 C M0M1 d0
+ mov r44 = ar.lc C I0
+ shladd np = nn, 3, np C M I
+ ;;
+ ld8 d1 = [dp] C M0M1 d1
+ mov r41 = b0 C I0
+ add r15 = -8, np C M I
+ add np = -16, np C M I
+ mov r40 = r0 C M I
+ ;;
+ ld8 R1 = [r15] C M0M1 n1
+ ld8 R0 = [r34], -8 C M0M1 n0
+ ;;
+ cmp.ltu p6, p0 = d1, R1 C M I
+ cmp.eq p8, p0 = d1, R1 C M I
+ ;;
+ (p8) cmp.leu p6, p0 = d0, R0
+ cmp.ltu p8, p9 = R0, d0
+ (p6) br.cond.dpnt .L_high_limb_1 C FIXME: inline!
+.L8:
+
+ mov r45 = d1
+ br.call.sptk.many b0 = mpn_invert_limb C FIXME: inline+schedule
;;
- setf.sig f10 = r37 // n1 22
- (p9) add r38 = 1, r38 // 22
+ setf.sig fd1 = d1 C d1
+ setf.sig fd0 = d0 C d0
+ add r14 = r33, r35 C nn + qxn
;;
- setf.sig f9 = r38 // n2 23
- cmp.gtu p6, p7 = r39, r38 // 23
- (p7) br.cond.spnt L(fix)
-L(bck): st8 [r32] = r18, -8
- adds r35 = -1, r35
- br.cloop.sptk.few L(top)
+ setf.sig fdinv = r8 C dinv
+ mov r9 = -1
+ add r35 = -3, r14
;;
-
-L(end): add r14 = 8, r34
- add r15 = 16, r34
- mov b0 = r41
+ setf.sig fminus1 = r9
+ cmp.gt p6, p0 = r0, r35
+ shladd qp = r35, 3, qp
+ mov ar.lc = r35
+ mov r31 = 0 C n0
+ (p6) br.cond.dpnt .Ldone
;;
- st8 [r14] = r37
- st8 [r15] = r38
- mov ar.pfs = r42
- mov r8 = r40
- mov ar.lc = r45
+ ALIGN(16)
+C *** MAIN LOOP START ***
+.Loop: C 00 each pass computes one quotient limb into r18 (the ;; numbers look like cycle counts - confirm)
+ mov r15 = R0 C nadj = n10
+ cmp.le p14, p15 = 0, R0 C check high bit of R0
+ cmp.le p8, p0 = r33, r35 C dividend limbs remaining?
+ ;; C 01
+ .pred.rel "mutex", p14, p15
+ (p8) ld8 r31 = [r34], -8 C n0: next dividend limb if p8, otherwise r31 keeps the 0 set earlier
+ (p15) add r15 = d1, R0 C nadj = n10 + d1
+ (p15) add r14 = 1, R1 C nh + (nl:63)
+ (p14) mov r14 = R1 C nh
+ cmp.eq p6, p0 = d1, R1 C nh == d1
+ (p6) br.cond.spnt .L_R1_eq_d1 C handled separately with q = -1
+ ;; C 02
+ setf.sig f8 = r14 C n2 + (nl:63)
+ setf.sig f15 = r15 C nadj
+ sub r23 = -1, R1 C r23 = ~nh
+ ;; C 03
+ setf.sig fnh = r23 C fnh = ~nh
+ setf.sig fnl = R0 C fnl = nl
+ ;; C 08
+ xma.hu f7 = fdinv, f8, f15 C xh = HI(dinv*(nh-nmask)+nadj)
+ ;; C 12
+ xma.l f7 = f7, fminus1, fnh C f7 = -xh + ~nh = ~(nh + xh) mod 2^64
+ ;; C 16
+ getf.sig r14 = f7 C integer copy of f7
+ xma.hu f9 = f7, fd1, fnl C xh = HI(q1*d1+nl)
+ xma.l f33 = f7, fd1, fnl C xl = LO(q1*d1+nl)
+ ;; C 20
+ getf.sig r16 = f9 C r16 = xh
+ sub r24 = d1, R1 C r24 = d1 - nh
+ C 21
+ getf.sig r17 = f33 C r17 = xl
+ ;; C 25
+ cmp.eq p6, p7 = r16, r24 C p6 = (xh == d1 - nh)
+ ;; C 26
+ .pred.rel "mutex", p6, p7
+ (p6) xma.l f8 = f7, fminus1, f0 C f8 = -f7
+ (p7) xma.l f8 = f7,fminus1,fminus1 C f8 = -f7-1
+ ;; C 27
+ .pred.rel "mutex", p6, p7
+ (p6) sub r18 = 0, r14 C q = -q1
+ (p7) sub r18 = -1, r14 C q = -q1-1
+ (p6) add r14 = 0, r17 C n1 = xl
+ (p7) add r14 = d1, r17 C n1 = xl + d1
+ ;; C 30
+ xma.hu f9 = fd0, f8, f0 C d0*(-f7-1) = -d0*f7-d0
+ xma.l f35 = fd0, f8, f0 C low half of the same product d0 * f8
+ ;; C 34
+ getf.sig P1 = f9 C P1
+ C 35
+ getf.sig P0 = f35 C P0
+ ;;
+.L_adj: C 40
+ cmp.ltu p8, p0 = r31, P0 C p8 = cy from low limb
+ cmp.ltu p6, p0 = r14, P1 C p6 = prel cy from high limb
+ sub R0 = r31, P0 C R0 = n0 - P0
+ sub R1 = r14, P1 C R1 = n1 - P1
+ ;; C 41
+ (p8) cmp.eq.or p6, p0 = 0, R1 C p6 = final cy from high limb
+ (p8) add R1 = -1, R1 C take the low limb borrow out of R1
+ cmp.ne p10, p0 = r0, r0 C clear p10 FIXME: use unc below!
+ cmp.ne p13, p0 = r0, r0 C clear p13 FIXME: use unc below!
+ ;; C 42
+ (p6) add R0 = R0, d0 C borrow seen: add the divisor back (low limb)
+ (p6) add R1 = R1, d1 C borrow seen: add the divisor back (high limb)
+ (p6) add r18 = -1, r18 C q--
+ ;; C 43
+ (p6) cmp.ltu p10, p0 = R0, d0 C p10 = carry out of the low limb add-back
+ (p6) cmp.ltu p0, p13 = R1, d1 C p13 = no carry out of the high limb add-back
+ ;; C 44
+ (p10) cmp.ne.and p0, p13 = -1, R1 C p13 = !cy
+ (p10) add R1 = 1, R1 C propagate the low limb carry into R1
+ (p13) br.cond.spnt .L_two_too_big C jump if not cy
+ ;; C 45
+ st8 [qp] = r18, -8 C store quotient limb and step qp down one limb
+ add r35 = -1, r35 C step the limb index compared against r33 at .Loop
+ mov r31 = 0 C n0, next iteration
+ br.cloop.sptk .Loop C repeat while ar.lc is nonzero
+C *** MAIN LOOP END ***
+ ;;
+.Ldone: C write back the two remainder limbs and restore saved state
+ mov r8 = r40 C return value: r40 is 0 unless .L_high_limb_1 set it to 1
+ mov b0 = r41 C return address saved in r41 on entry
+ add r21 = 8, r34 C r21 = address for R0
+ add r22 = 16, r34 C r22 = address for R1
+ ;;
+ st8 [r21] = R0 C store low remainder limb
+ st8 [r22] = R1 C store high remainder limb
+ mov ar.pfs = r42 C restore ar.pfs saved by alloc
+ mov ar.lc = r44 C restore ar.lc saved in r44 on entry
br.ret.sptk.many b0
- ;;
-.L51:
+
+.L_high_limb_1:
.pred.rel "mutex", p8, p9
- sub r37 = r37, r36
- (p9) sub r38 = r38, r39, 1
- (p8) sub r38 = r38, r39
- adds r40 = 1, r0
- br .L8
+ sub R0 = R0, d0
+ (p8) sub R1 = R1, d1, 1
+ (p9) sub R1 = R1, d1
+ mov r40 = 1
+ br.sptk .L8
;;
-L(fix): cmp.geu p6, p7 = r39, r38
- cmp.leu p8, p9 = r36, r37
+.L_two_too_big: C reached when the first add-back gave no carry: add the divisor once more
+ add R0 = R0, d0 C low remainder limb += d0
+ add R1 = R1, d1 C high remainder limb += d1
+ ;;
+ add r18 = -1, r18 C q--
+ cmp.ltu p10, p0 = R0, d0 C p10 = carry out of the low limb add
+ ;;
+ (p10) add R1 = 1, R1 C propagate that carry into the high limb
+ st8 [qp] = r18, -8 C store quotient limb and step qp down one limb
+ add r35 = -1, r35 C step the limb index compared against r33 at .Loop
+ mov r31 = 0 C n0, next iteration
+ br.cloop.sptk .Loop C repeat while ar.lc is nonzero
+ br.sptk .Ldone C otherwise finish up
+
+.L_R1_eq_d1: C entered from .Loop when R1 == d1: the quotient limb is set to -1
+ add r14 = R0, d1 C r = R0 + d1
+ mov r18 = -1 C q = -1
;;
- (p8) cmp4.ne.and.orcm p6, p7 = 0, r0
- (p6) br.cond.dptk L(bck)
- sub r37 = r37, r36
- (p9) sub r38 = r38, r39, 1
- (p8) sub r38 = r38, r39
- adds r18 = 1, r18
+ cmp.leu p6, p0 = R0, r14 C p6 = no carry out of R0 + d1
+ (p6) br.cond.spnt .L20 C jump unless cy
;;
- setf.sig f9 = r38 // n2
- setf.sig f10 = r37 // n1
- br L(bck)
-
+ sub P1 = r14, d0 C P1 = r - d0
+ add R0 = r31, d0 C R0 = n0 + d0
+ ;;
+ cmp.ltu p8, p9 = R0, r31 C p8 = carry out of n0 + d0
+ ;;
+ .pred.rel "mutex", p8, p9
+ st8 [qp] = r18, -8 C store quotient limb -1 and step qp down one limb
+ (p8) add R1 = r0, P1, 1 C R1 = P1 + 1 (carry in from the low limb)
+ (p9) add R1 = r0, P1 C R1 = P1 (no carry)
+ add r35 = -1, r35 C step the limb index compared against r33 at .Loop
+ mov r31 = 0 C n0, next iteration
+ br.cloop.sptk .Loop C repeat while ar.lc is nonzero
+ br.sptk .Ldone C otherwise finish up
+ ;;
+.L20: cmp.ne p6, p7 = 0, d0 C p6 = d0 is nonzero; this block forms P1:P0 = d0 * (2^64 - 1)
+ ;;
+ .pred.rel "mutex", p6, p7
+ (p6) add P1 = -1, d0 C high half = d0 - 1 when d0 is nonzero
+ (p7) mov P1 = d0 C d0 is zero here so the high half is zero
+ sub P0 = r0, d0 C low half = -d0 mod 2^64
+ br.sptk .L_adj C rejoin the common subtract and adjust code
EPILOGUE()
ASM_END()
diff --git a/gmp/mpn/ia64/gcd_1.asm b/gmp/mpn/ia64/gcd_1.asm
index 28e9a63ca3..d3840a6bd8 100644
--- a/gmp/mpn/ia64/gcd_1.asm
+++ b/gmp/mpn/ia64/gcd_1.asm
@@ -1,42 +1,28 @@
dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
-dnl Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2002-2005, 2012, 2013 Free Software Foundation, Inc.
+dnl Copyright 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/bitpair (1x1 gcd)
-C Itanium: ?
-C Itanium 2: 5.1
+C itanium2: 6.3
+C itanium: 14 (approx)
C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
@@ -59,13 +45,29 @@ C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
C stripping factors of 2 from abs(x-y). Those factors of two are
C determined from just y-x, without the abs(), since there's the same
C number of trailing zeros on n or -n in twos complement. That makes the
-C dependent chain 8 cycles deep.
+C dependent chain
+C
+C cycles
+C 1 sub x-y and x-y-1
+C 3 andcm (x-y-1)&~(x-y)
+C 2 popcnt trailing zeros
+C 3 shr.u strip abs(x-y)
+C ---
+C 9
C
C The selection of x-y versus y-x for abs(x-y), and the selection of the
-C minimum of x and y, is done in parallel with the critical path.
+C minimum of x and y, is done in parallel with the above.
C
C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
-C average, hence the final 5.8 cycles/bitpair.
+C average, hence the final 6.3 cycles/bitpair.
+C
+C The loop is not as fast as one might hope, since there's extra latency
+C from andcm going across to the `multimedia' popcnt, and vice versa from
+C multimedia shr.u back to the integer sub.
+C
+C The loop branch is .sptk.clr since we usually expect a good number of
+C iterations, and the iterations are data dependent so it's unlikely past
+C results will predict anything much about the future.
C
C Not done:
C
@@ -86,10 +88,13 @@ C only going down I0), perhaps it'd be possible to shift left instead,
C using add. That would mean keeping track of the lowest not-yet-zeroed
C bit, using some sort of mask.
C
-C TODO:
-C * Once mod_1_N exists in assembly for Itanium, add conditional calls.
-C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
-C * Probably avoid popcnt also outside of loop, instead use ctz_table.
+C Itanium-1:
+C
+C This code is not designed for itanium-1 and in fact doesn't run well on
+C that chip. The loop seems to be about 21 cycles, probably because we end
+C up with a 10 cycle replay for not forcibly scheduling the shr.u latency.
+C Lack of branch hints might introduce a couple of bubbles too.
+C
ASM_START()
.explicit C What does this mean?
@@ -98,19 +103,6 @@ C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
.global mpn_modexact_1c_odd
.type mpn_modexact_1c_odd,@function
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
- .section ".rodata"
- ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep
-ctz_table:
- .byte MAXSHIFT
-forloop(i,1,MASK,
-` .byte m4_count_trailing_zeros(i)
-')
-
PROLOGUE(mpn_gcd_1)
C r32 xp
@@ -154,9 +146,13 @@ ifdef(`HAVE_ABI_32',
mov out_carry = 0
+ C
+
popcnt y_twos = y_twos C I0 y twos
;;
+ C
+
{ .mmi; add x_orig_one = -1, x_orig C M0 orig x-1
shr.u out_divisor = y, y_twos C I0 y without twos
}{ shr.u y = y, y_twos C I1 y without twos
@@ -173,62 +169,63 @@ ifdef(`HAVE_ABI_32',
mov b0 = save_rp C I0
} ;;
+ C
+
popcnt x_orig = x_orig C I0 orig x twos
+
popcnt r9 = r9 C I0 x twos
;;
+ C
+
{ cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos
shr.u x = x, r9 C I0 x odd
} ;;
{ (p7) mov y_twos = x_orig C M0 common twos
add r10 = -1, y C I0 y-1
- (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y
+ (p6) br.dpnt.few .Ldone_y C B0 x%y==0 then result y
} ;;
- addl r22 = @ltoffx(ctz_table#), r1
- mov r25 = m4_lshift(MASK, MAXSHIFT)
- ;;
- ld8.mov r22 = [r22], ctz_table#
- br L(ent)
-
- ALIGN(32)
-L(top): .pred.rel "mutex", p6,p7
-.mmi; (p7) mov y = x
- (p6) sub x = x, y
- dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits)
-.mmi; and r20 = MASK, r19
- (p7) mov x = r19
- nop 0
- ;;
-L(mid):
-.mmb; ld1 r16 = [r21]
- cmp.eq p10,p0 = 0, r20
- (p10) br.spnt.few.clr L(shift_alot)
- ;;
-.mmi; nop 0
- nop 0
- shr.u x = x, r16
+ C
+
+
+ C No noticeable difference in speed for the loop aligned to
+ C 32 or just 16.
+.Ltop:
+ C r8 x
+ C r10 y-1
+ C r34 y
+ C r38 common twos, for use at end
+
+{ .mmi; cmp.gtu p8,p9 = x, y C M0 x>y
+ cmp.ne p10,p0 = x, y C M1 x==y
+ sub r9 = y, x C I0 d = y - x
+}{ .mmi; sub r10 = r10, x C M2 d-1 = y - x - 1
+} ;;
+
+{ .mmi; .pred.rel "mutex", p8, p9
+ (p8) sub x = x, y C M0 x>y use x=x-y, y unchanged
+ (p9) mov y = x C M1 y>=x use y=x
+ (p9) mov x = r9 C I0 y>=x use x=y-x
+}{ .mmi; andcm r9 = r10, r9 C M2 (d-1)&~d
;;
-L(ent):
-.mmi; sub r19 = y, x
- cmp.gtu p6,p7 = x, y
- cmp.ne p8,p0 = x, y
-.mmb; nop 0
- nop 0
- (p8) br.sptk.few.clr L(top)
+
+ add r10 = -1, y C M0 new y-1
+ popcnt r9 = r9 C I0 twos on x-y
+} ;;
+
+{ shr.u x = x, r9 C I0 new x without twos
+ (p10) br.sptk.few.clr .Ltop
+} ;;
+
C result is y
-L(done_y):
- mov ar.pfs = save_pfs C I0
+.Ldone_y:
shl r8 = y, y_twos C I common factors of 2
+ ;;
+ mov ar.pfs = save_pfs C I0
br.ret.sptk.many b0
-L(shift_alot):
- and r20 = x, r25
- shr.u x = x, MAXSHIFT
- ;;
- dep r21 = x, r22, 0, MAXSHIFT
- br L(mid)
EPILOGUE()
diff --git a/gmp/mpn/ia64/gmp-mparam.h b/gmp/mpn/ia64/gmp-mparam.h
index bdbd62d974..9391887e10 100644
--- a/gmp/mpn/ia64/gmp-mparam.h
+++ b/gmp/mpn/ia64/gmp-mparam.h
@@ -1,204 +1,72 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2000-2005, 2009-2011, 2014 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 900MHz Itanium2 (titanic.gmplib.org) */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.4 */
-
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 26
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD 12
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 40
-#define MUL_TOOM33_THRESHOLD 129
-#define MUL_TOOM44_THRESHOLD 212
-#define MUL_TOOM6H_THRESHOLD 318
-#define MUL_TOOM8H_THRESHOLD 430
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 208
-
-#define SQR_BASECASE_THRESHOLD 11
-#define SQR_TOOM2_THRESHOLD 82
-#define SQR_TOOM3_THRESHOLD 131
-#define SQR_TOOM4_THRESHOLD 494
-#define SQR_TOOM6_THRESHOLD 0 /* always */
-#define SQR_TOOM8_THRESHOLD 0 /* always */
-
-#define MULMID_TOOM42_THRESHOLD 98
-
-#define MULMOD_BNM1_THRESHOLD 23
-#define SQRMOD_BNM1_THRESHOLD 19
-
-#define MUL_FFT_MODF_THRESHOLD 500 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 500, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \
- { 31, 7}, { 16, 6}, { 35, 7}, { 18, 6}, \
- { 37, 7}, { 19, 6}, { 39, 7}, { 33, 8}, \
- { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \
- { 21, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \
- { 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \
- { 57, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
- { 23, 9}, { 59,10}, { 31, 9}, { 71,10}, \
- { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
- { 55,11}, { 31,10}, { 87,11}, { 47,10}, \
- { 111,12}, { 31,11}, { 63,10}, { 143,11}, \
- { 79,10}, { 167,11}, { 95,10}, { 199,11}, \
- { 111,12}, { 63,11}, { 143,10}, { 287,11}, \
- { 159,12}, { 95,11}, { 191,10}, { 399,11}, \
- { 207,10}, { 415,13}, { 63,12}, { 127,11}, \
- { 271,10}, { 543,11}, { 287,10}, { 575,12}, \
- { 159,11}, { 335,10}, { 671,11}, { 367,12}, \
- { 191,11}, { 399,10}, { 799,11}, { 431,12}, \
- { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
- { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
- { 383,11}, { 799,12}, { 415,11}, { 863,12}, \
- { 447,14}, { 127,13}, { 255,12}, { 607,13}, \
- { 319,12}, { 735,13}, { 383,12}, { 863,13}, \
- { 447,12}, { 927,11}, { 1855,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1055,13}, { 575,12}, \
- { 1215,13}, { 639,12}, { 1279,13}, { 703,14}, \
- { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
- { 1727,13}, { 895,12}, { 1791,13}, { 959,15}, \
- { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
- { 1215,14}, { 639,13}, { 1343,12}, { 2687,13}, \
- { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
- { 1727,14}, { 895,13}, { 1855,15}, { 511,14}, \
- { 1023,13}, { 2111,12}, { 4223,13}, { 2175,14}, \
- { 1151,13}, { 2431,14}, { 1279,13}, { 2687,14}, \
- { 1407,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 150
-#define MUL_FFT_THRESHOLD 6272
-
-#define SQR_FFT_MODF_THRESHOLD 468 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 468, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \
- { 35, 7}, { 18, 6}, { 37, 7}, { 37, 8}, \
- { 19, 7}, { 40, 8}, { 29, 9}, { 15, 8}, \
- { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
- { 49, 9}, { 27, 8}, { 57, 9}, { 43,10}, \
- { 23, 9}, { 55,10}, { 31, 9}, { 71,10}, \
- { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
- { 55,11}, { 31,10}, { 87,11}, { 47,10}, \
- { 111,12}, { 31,11}, { 63,10}, { 143,11}, \
- { 79,10}, { 167,11}, { 95,10}, { 191,11}, \
- { 111,12}, { 63,11}, { 127,10}, { 255,11}, \
- { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
- { 399,11}, { 207,10}, { 415,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
- { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
- { 159,11}, { 335,10}, { 671,11}, { 367,10}, \
- { 735,12}, { 191,11}, { 399,10}, { 799,11}, \
- { 431,10}, { 863,12}, { 223,11}, { 463,13}, \
- { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
- { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
- { 735,13}, { 191,12}, { 383,11}, { 799,12}, \
- { 415,11}, { 863,12}, { 447,11}, { 895,14}, \
- { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \
- { 607,13}, { 319,12}, { 735,13}, { 383,12}, \
- { 863,13}, { 447,12}, { 959,14}, { 255,13}, \
- { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \
- { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \
- { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
- { 1663,13}, { 895,12}, { 1791,13}, { 959,15}, \
- { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
- { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
- { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
- { 1599,12}, { 3199,13}, { 1663,14}, { 895,13}, \
- { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
- { 1151,13}, { 2431,14}, { 1279,13}, { 2687,14}, \
- { 1407,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 154
-#define SQR_FFT_THRESHOLD 4032
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 62
-#define MULLO_MUL_N_THRESHOLD 11616
-
-#define DC_DIV_QR_THRESHOLD 61
-#define DC_DIVAPPR_Q_THRESHOLD 222
-#define DC_BDIV_QR_THRESHOLD 95
-#define DC_BDIV_Q_THRESHOLD 264
-
-#define INV_MULMOD_BNM1_THRESHOLD 78
-#define INV_NEWTON_THRESHOLD 133
-#define INV_APPR_THRESHOLD 141
-
-#define BINV_NEWTON_THRESHOLD 248
-#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */
-#define REDC_2_TO_REDC_N_THRESHOLD 148
-
-#define MU_DIV_QR_THRESHOLD 1187
-#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 1308
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 1,28,251,1925
-
-#define MATRIX22_STRASSEN_THRESHOLD 23
-#define HGCD_THRESHOLD 120
-#define HGCD_APPR_THRESHOLD 77
-#define HGCD_REDUCE_THRESHOLD 3389
-#define GCD_DC_THRESHOLD 393
-#define GCDEXT_DC_THRESHOLD 440
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 1216
-#define SET_STR_PRECOMPUTE_THRESHOLD 3170
-
-#define FAC_DSC_THRESHOLD 746
-#define FAC_ODD_THRESHOLD 0 /* always */
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+
+/* 1300MHz Itanium2 (babe.fsffrance.org) */
+
+/* Generated by tuneup.c, 2009-03-04, gcc 4.2 */
+
+#define MUL_KARATSUBA_THRESHOLD 44
+#define MUL_TOOM3_THRESHOLD 137
+#define MUL_TOOM44_THRESHOLD 230
+
+#define SQR_BASECASE_THRESHOLD 25
+#define SQR_KARATSUBA_THRESHOLD 119
+#define SQR_TOOM3_THRESHOLD 146
+#define SQR_TOOM4_THRESHOLD 284
+
+#define MULLOW_BASECASE_THRESHOLD 19
+#define MULLOW_DC_THRESHOLD 120
+#define MULLOW_MUL_N_THRESHOLD 357
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */
+#define DIV_DC_THRESHOLD 70
+#define POWM_THRESHOLD 312
+
+#define MATRIX22_STRASSEN_THRESHOLD 29
+#define HGCD_THRESHOLD 118
+#define GCD_DC_THRESHOLD 595
+#define GCDEXT_DC_THRESHOLD 584
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 8
+#define MOD_1_2_THRESHOLD 9
+#define MOD_1_4_THRESHOLD 20
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1 /* preinv always */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 1488
+#define SET_STR_PRECOMPUTE_THRESHOLD 3590
+
+#define MUL_FFT_TABLE { 528, 1184, 1856, 3840, 11264, 28672, 114688, 327680, 0 }
+#define MUL_FFT_MODF_THRESHOLD 784
+#define MUL_FFT_THRESHOLD 6656
+
+#define SQR_FFT_TABLE { 592, 1248, 2368, 3840, 11264, 28672, 81920, 327680, 0 }
+#define SQR_FFT_MODF_THRESHOLD 608
+#define SQR_FFT_THRESHOLD 4992
diff --git a/gmp/mpn/ia64/hamdist.asm b/gmp/mpn/ia64/hamdist.asm
index 477df4cd71..b150a429cb 100644
--- a/gmp/mpn/ia64/hamdist.asm
+++ b/gmp/mpn/ia64/hamdist.asm
@@ -1,39 +1,25 @@
dnl IA-64 mpn_hamdist -- mpn hamming distance.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2003-2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl
+dnl This file is part of the GNU MP Library.
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C Itanium: 2
C Itanium 2: 1
C INPUT PARAMETERS
diff --git a/gmp/mpn/ia64/ia64-defs.m4 b/gmp/mpn/ia64/ia64-defs.m4
index f71d280b17..65ade991d0 100644
--- a/gmp/mpn/ia64/ia64-defs.m4
+++ b/gmp/mpn/ia64/ia64-defs.m4
@@ -2,32 +2,21 @@ divert(-1)
dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl ia64 assembler comments are C++ style "//" to the end of line. gas
@@ -86,7 +75,7 @@ dnl
dnl Emit a ".align" directive. "bytes" is eval()ed, so can be an
dnl expression.
dnl
-dnl This version overrides the definition in mpn/asm-defs.m4. We suppress
+dnl This version overrides the definition in mpn/asm-defs.m4. We supress
dnl any .align if the gas byte-swapped-nops bug was detected by configure
dnl GMP_ASM_IA64_ALIGN_OK.
@@ -99,7 +88,7 @@ m4_assert_defined(`IA64_ALIGN_OK')
dnl Usage: ASSERT([pr] [,code])
dnl
-dnl Require that the given predicate register is true after executing the
+dnl Require that the given predictate register is true after executing the
dnl test code. For example,
dnl
dnl ASSERT(p6,
@@ -131,17 +120,5 @@ define(`ASSERT_label_counter',eval(ASSERT_label_counter+1))
')')
define(`ASSERT_label_counter',1)
-define(`getfsig', `getf.sig')
-define(`setfsig', `setf.sig')
-define(`cmpeq', `cmp.eq')
-define(`cmpne', `cmp.ne')
-define(`cmpltu', `cmp.ltu')
-define(`cmpleu', `cmp.leu')
-define(`cmpgtu', `cmp.gtu')
-define(`cmpgeu', `cmp.geu')
-define(`cmple', `cmp.le')
-define(`cmpgt', `cmp.gt')
-define(`cmpeqor', `cmp.eq.or')
-define(`cmpequc', `cmp.eq.unc')
divert
diff --git a/gmp/mpn/ia64/invert_limb.asm b/gmp/mpn/ia64/invert_limb.asm
index 5effdda815..982886e549 100644
--- a/gmp/mpn/ia64/invert_limb.asm
+++ b/gmp/mpn/ia64/invert_limb.asm
@@ -1,34 +1,21 @@
dnl IA-64 mpn_invert_limb -- Invert a normalized limb.
-dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
-
dnl Copyright 2000, 2002, 2004 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -36,7 +23,7 @@ C INPUT PARAMETERS
C d = r32
C cycles
-C Itanium: 74
+C Itanium: ?
C Itanium 2: 50+6
C It should be possible to avoid the xmpy.hu and the following tests by
diff --git a/gmp/mpn/ia64/logops_n.asm b/gmp/mpn/ia64/logops_n.asm
index e4a2f61cce..3ab9d2518b 100644
--- a/gmp/mpn/ia64/logops_n.asm
+++ b/gmp/mpn/ia64/logops_n.asm
@@ -1,35 +1,22 @@
dnl IA-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2003-2005 Free Software Foundation, Inc.
-
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -91,8 +78,6 @@ ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
addp4 vp = 0, vp C M I
- nop.m 0
- nop.m 0
zxt4 n = n C I
;;
')
diff --git a/gmp/mpn/ia64/lorrshift.asm b/gmp/mpn/ia64/lorrshift.asm
index 694aaf0f40..59badebc6a 100644
--- a/gmp/mpn/ia64/lorrshift.asm
+++ b/gmp/mpn/ia64/lorrshift.asm
@@ -1,57 +1,40 @@
dnl IA-64 mpn_lshift/mpn_rshift.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2000-2005 Free Software Foundation, Inc.
+dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C Itanium: 2
-C Itanium 2: 1
+C Itanium: 2.0
+C Itanium 2: 1.0
C This code is scheduled deeply since the plain shift instructions shr and shl
C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
C these instructions causes a 10 cycle replay trap on Itanium.
-C The ld8 scheduling should probably be decreased to make the function smaller.
-C Good lfetch will make sure we never stall anyway.
-
-C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
-C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
-C in the prologue.
-
+C TODO
+C * Optimize function entry and feed-in code.
C INPUT PARAMETERS
-define(`rp', `r32')
-define(`up', `r33')
-define(`n', `r34')
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
define(`cnt',`r35')
define(`tnc',`r9')
@@ -78,281 +61,284 @@ MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
ASM_START()
PROLOGUE(func)
.prologue
- .save ar.lc, r2
+ .save ar.lc, r2
.body
ifdef(`HAVE_ABI_32',
-` addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- sxt4 n = n C M I
- nop.m 0
- nop.m 0
- zxt4 cnt = cnt C I
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ sxt4 n = n C M I
+ zxt4 cnt = cnt C I
;;
')
- {.mmi; cmp.lt p14, p15 = 4, n C M I
- and r14 = 3, n C M I
- mov.i r2 = ar.lc C I0
-}{.mmi; add r15 = -1, n C M I
- sub tnc = 64, cnt C M I
- add r16 = -5, n
- ;;
-}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
- cmp.eq p7, p0 = 2, r14 C M I
- shr.u n = r16, 2 C I0
-}{.mmi; cmp.eq p8, p0 = 3, r14 C M I
+ {.mmi; cmp.lt p14, p15 = 4, n C M I
+ and r14 = 3, n C M I
+ mov.i r2 = ar.lc C I0
+}{.mmi; add r15 = -1, n C M I
+ sub tnc = 64, cnt C M I
+ add r16 = -5, n
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ shr.u n = r16, 2 C I0
+}{.mmi; cmp.eq p8, p0 = 3, r14 C M I
ifdef(`OPERATION_lshift',
-` shladd up = r15, 3, up C M I
- shladd rp = r15, 3, rp') C M I
+` shladd up = r15, 3, up C M I
+ shladd rp = r15, 3, rp') C M I
;;
-}{.mmi; add r11 = POFF, up C M I
- ld8 r10 = [up], UPD C M01
- mov.i ar.lc = n C I0
+}{.mmi; add r11 = POFF, up C M I
+ ld8 r10 = [up], UPD C M01
+ mov.i ar.lc = n C I0
}{.bbb;
- (p6) br.dptk .Lb01
- (p7) br.dptk .Lb10
- (p8) br.dptk .Lb11
- ;; }
-
-.Lb00: ld8 r19 = [up], UPD
+ (p6) br.dptk .Lb01
+ (p7) br.dptk .Lb10
+ (p8) br.dptk .Lb11
;;
- ld8 r16 = [up], UPD
+}
+
+.Lb00: ld8 r19 = [up], UPD
;;
- ld8 r17 = [up], UPD
- BSH r8 = r10, tnc C function return value
+ ld8 r16 = [up], UPD
;;
- FSH r24 = r10, cnt
- BSH r25 = r19, tnc
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc C function return value
(p14) br.cond.dptk .grt4
+
+ FSH r24 = r10, cnt
+ BSH r25 = r19, tnc
;;
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
;;
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
;;
- or r14 = r25, r24
- FSH r22 = r17, cnt
- BSH r23 = r10, tnc
- br .Lr4
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ BSH r23 = r10, tnc
+ br .Lr4
-.grt4: ld8 r18 = [up], UPD
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
+.grt4: FSH r24 = r10, cnt
+ BSH r25 = r19, tnc
;;
- ld8 r19 = [up], UPD
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
+ ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
;;
- ld8 r16 = [up], UPD
- FSH r22 = r17, cnt
- BSH r23 = r18, tnc
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
;;
- or r14 = r25, r24
- ld8 r17 = [up], UPD
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
+ ;;
+ or r14 = r25, r24
+ ld8 r17 = [up], UPD
br.cloop.dpnt .Ltop
- br .Lbot
+ br .Lbot
.Lb01:
- (p15) BSH r8 = r10, tnc C function return value I
- (p15) FSH r22 = r10, cnt C I
- (p15) br.cond.dptk .Lr1 C return B
+ (p15) BSH r8 = r10, tnc C function return value I
+ (p15) FSH r22 = r10, cnt C I
+ (p15) br.cond.dptk .Lr1 C return B
-.grt1: ld8 r18 = [up], UPD
+.grt1: ld8 r18 = [up], UPD
;;
- ld8 r19 = [up], UPD
- BSH r8 = r10, tnc C function return value
+ ld8 r19 = [up], UPD
+ BSH r8 = r10, tnc C function return value
;;
- ld8 r16 = [up], UPD
- FSH r22 = r10, cnt
- BSH r23 = r18, tnc
+ ld8 r16 = [up], UPD
+ FSH r22 = r10, cnt
+ BSH r23 = r18, tnc
;;
- ld8 r17 = [up], UPD
- FSH r24 = r18, cnt
- BSH r25 = r19, tnc
+ ld8 r17 = [up], UPD
br.cloop.dpnt .grt5
;;
- or r15 = r23, r22
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
+
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ;;
+ or r15 = r23, r22
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
;;
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
- br .Lr5
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ br .Lr5
-.grt5: ld8 r18 = [up], UPD
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
+.grt5: FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
;;
- ld8 r19 = [up], UPD
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
+ ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
;;
- or r15 = r23, r22
- ld8 r16 = [up], UPD
- br .LL01
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r15 = r23, r22
+ ld8 r16 = [up], UPD
+ br .LL01
-.Lb10: ld8 r17 = [up], UPD
+.Lb10: ld8 r17 = [up], UPD
(p14) br.cond.dptk .grt2
- BSH r8 = r10, tnc C function return value
+ BSH r8 = r10, tnc C function return value
;;
- FSH r20 = r10, cnt
- BSH r21 = r17, tnc
+ FSH r20 = r10, cnt
+ BSH r21 = r17, tnc
;;
- or r14 = r21, r20
- FSH r22 = r17, cnt
- br .Lr2 C return
+ or r14 = r21, r20
+ FSH r22 = r17, cnt
+ br .Lr2 C return
-.grt2: ld8 r18 = [up], UPD
- BSH r8 = r10, tnc C function return value
+.grt2: ld8 r18 = [up], UPD
+ BSH r8 = r10, tnc C function return value
;;
- ld8 r19 = [up], UPD
- FSH r20 = r10, cnt
- BSH r21 = r17, tnc
+ ld8 r19 = [up], UPD
+ FSH r20 = r10, cnt
+ BSH r21 = r17, tnc
;;
- ld8 r16 = [up], UPD
- FSH r22 = r17, cnt
- BSH r23 = r18, tnc
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
;;
- {.mmi; ld8 r17 = [up], UPD
- or r14 = r21, r20
- FSH r24 = r18, cnt
-}{.mib; nop 0
- BSH r25 = r19, tnc
+ ld8 r17 = [up], UPD
br.cloop.dpnt .grt6
- ;; }
+ ;;
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
- br .Lr6
+ or r14 = r21, r20
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ;;
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ br .Lr6
-.grt6: ld8 r18 = [up], UPD
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
+.grt6: or r14 = r21, r20
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
;;
- ld8 r19 = [up], UPD
- br .LL10
+ ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ br .LL10
-.Lb11: ld8 r16 = [up], UPD
+.Lb11: ld8 r16 = [up], UPD
;;
- ld8 r17 = [up], UPD
- BSH r8 = r10, tnc C function return value
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc C function return value
(p14) br.cond.dptk .grt3
;;
- FSH r26 = r10, cnt
- BSH r27 = r16, tnc
+ FSH r26 = r10, cnt
+ BSH r27 = r16, tnc
;;
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
;;
- or r15 = r27, r26
- FSH r22 = r17, cnt
- br .Lr3 C return
+ or r15 = r27, r26
+ FSH r22 = r17, cnt
+ br .Lr3 C return
-.grt3: ld8 r18 = [up], UPD
- FSH r26 = r10, cnt
- BSH r27 = r16, tnc
+.grt3: ld8 r18 = [up], UPD
+ FSH r26 = r10, cnt
+ BSH r27 = r16, tnc
;;
- ld8 r19 = [up], UPD
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
;;
- ld8 r16 = [up], UPD
- FSH r22 = r17, cnt
- BSH r23 = r18, tnc
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
;;
- ld8 r17 = [up], UPD
+ ld8 r17 = [up], UPD
br.cloop.dpnt .grt7
- or r15 = r27, r26
- FSH r24 = r18, cnt
- BSH r25 = r19, tnc
- br .Lr7
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ br .Lr7
-.grt7: or r15 = r27, r26
- FSH r24 = r18, cnt
- BSH r25 = r19, tnc
- ld8 r18 = [up], UPD
- br .LL11
+.grt7: or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ld8 r18 = [up], UPD
+ br .LL11
C *** MAIN LOOP START ***
ALIGN(32)
.Ltop:
- {.mmi; st8 [rp] = r14, UPD C M2
- or r15 = r27, r26 C M3
- FSH r24 = r18, cnt C I0
-}{.mmi; ld8 r18 = [up], UPD C M1
- lfetch [r11], PUPD
- BSH r25 = r19, tnc C I1
+ {.mmi; st8 [rp] = r14, UPD C M2
+ or r15 = r27, r26 C M3
+ FSH r24 = r18, cnt C I0
+}{.mmi; ld8 r18 = [up], UPD C M1
+ lfetch [r11], PUPD
+ BSH r25 = r19, tnc C I1
;; }
.LL11:
- {.mmi; st8 [rp] = r15, UPD
- or r14 = r21, r20
- FSH r26 = r19, cnt
-}{.mmi; ld8 r19 = [up], UPD
- nop.m 0
- BSH r27 = r16, tnc
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mmi; ld8 r19 = [up], UPD
+ nop.m 0
+ BSH r27 = r16, tnc
;; }
.LL10:
- {.mmi; st8 [rp] = r14, UPD
- or r15 = r23, r22
- FSH r20 = r16, cnt
-}{.mmi; ld8 r16 = [up], UPD
- nop.m 0
- BSH r21 = r17, tnc
+ {.mmi; st8 [rp] = r14, UPD
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mmi; ld8 r16 = [up], UPD
+ nop.m 0
+ BSH r21 = r17, tnc
;; }
.LL01:
- {.mmi; st8 [rp] = r15, UPD
- or r14 = r25, r24
- FSH r22 = r17, cnt
-}{.mib; ld8 r17 = [up], UPD
- BSH r23 = r18, tnc
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+}{.mib; ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
br.cloop.dptk .Ltop
;; }
+
C *** MAIN LOOP END ***
-.Lbot:
- {.mmi; st8 [rp] = r14, UPD
- or r15 = r27, r26
- FSH r24 = r18, cnt
-}{.mib; nop 0
- BSH r25 = r19, tnc
- nop 0
- ;; }
-.Lr7:
- {.mmi; st8 [rp] = r15, UPD
- or r14 = r21, r20
- FSH r26 = r19, cnt
-}{.mib; nop 0
- BSH r27 = r16, tnc
- nop 0
- ;; }
-.Lr6:
- {.mmi; st8 [rp] = r14, UPD
- or r15 = r23, r22
- FSH r20 = r16, cnt
-}{.mib; nop 0
- BSH r21 = r17, tnc
- nop 0
- ;; }
-.Lr5: st8 [rp] = r15, UPD
- or r14 = r25, r24
- FSH r22 = r17, cnt
+.Lbot: or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ st8 [rp] = r14, UPD
+ ;;
+.Lr7: or r14 = r21, r20
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ st8 [rp] = r15, UPD
+ ;;
+.Lr6: or r15 = r23, r22
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ st8 [rp] = r14, UPD
+ ;;
+.Lr5: st8 [rp] = r15, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
;;
-.Lr4: st8 [rp] = r14, UPD
- or r15 = r27, r26
+.Lr4: or r15 = r27, r26
+ st8 [rp] = r14, UPD
;;
-.Lr3: st8 [rp] = r15, UPD
- or r14 = r21, r20
+.Lr3: or r14 = r21, r20
+ st8 [rp] = r15, UPD
;;
-.Lr2: st8 [rp] = r14, UPD
+.Lr2: st8 [rp] = r14, UPD
;;
-.Lr1: st8 [rp] = r22, UPD C M23
- mov ar.lc = r2 C I0
- br.ret.sptk.many b0 C B
+.Lr1: st8 [rp] = r22, UPD C M23
+ mov ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
EPILOGUE(func)
ASM_END()
diff --git a/gmp/mpn/ia64/lshiftc.asm b/gmp/mpn/ia64/lshiftc.asm
deleted file mode 100644
index c402486484..0000000000
--- a/gmp/mpn/ia64/lshiftc.asm
+++ /dev/null
@@ -1,463 +0,0 @@
-dnl IA-64 mpn_lshiftc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2000-2005, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: ?
-C Itanium 2: 1.25
-
-C This code is scheduled deeply since the plain shift instructions shr and shl
-C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
-C these instructions cause a 10 cycle replay trap on Itanium.
-
-C The ld8 scheduling should probably be decreased to make the function smaller.
-C Good lfetch will make sure we never stall anyway.
-
-C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
-C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
-C in the prologue.
-
-
-C INPUT PARAMETERS
-define(`rp', `r32')
-define(`up', `r33')
-define(`n', `r34')
-define(`cnt',`r35')
-
-define(`tnc',`r9')
-
-define(`FSH',`shl')
-define(`BSH',`shr.u')
-define(`UPD',`-8')
-define(`POFF',`-512')
-define(`PUPD',`-32')
-define(`func',`mpn_lshiftc')
-
-ASM_START()
-PROLOGUE(mpn_lshiftc)
- .prologue
- .save ar.lc, r2
- .body
-ifdef(`HAVE_ABI_32',
-` addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- sxt4 n = n C M I
- nop.m 0
- nop.m 0
- zxt4 cnt = cnt C I
- ;;
-')
-
- {.mmi; nop 0 C M I
- and r14 = 3, n C M I
- mov.i r2 = ar.lc C I0
-}{.mmi; add r15 = -1, n C M I
- sub tnc = 64, cnt C M I
- nop 0
- ;;
-}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
- cmp.eq p7, p0 = 2, r14 C M I
- shr.u n = r15, 2 C I0
-}{.mmi; cmp.eq p8, p0 = 3, r14 C M I
- shladd up = r15, 3, up C M I
- shladd rp = r15, 3, rp C M I
- ;;
-}{.mmi; add r11 = POFF, up C M I
- ld8 r10 = [up], UPD C M01
- mov.i ar.lc = n C I0
-}{.bbb;
- (p6) br.dptk .Lb01
- (p7) br.dptk .Lb10
- (p8) br.dptk .Lb11
- ;; }
-
-.Lb00:
- ld8 r19 = [up], UPD
- ;;
- ld8 r16 = [up], UPD
- ;;
- ld8 r17 = [up], UPD
- BSH r8 = r10, tnc
- br.cloop.dptk L(gt4)
- ;;
- FSH r24 = r10, cnt
- BSH r25 = r19, tnc
- ;;
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
- ;;
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
- ;;
- or r14 = r25, r24
- FSH r22 = r17, cnt
- ;;
- or r15 = r27, r26
- sub r31 = -1, r14
- br .Lr4
-
-L(gt4):
- {.mmi; nop 0
- nop 0
- FSH r24 = r10, cnt
-}{.mmi; ld8 r18 = [up], UPD
- nop 0
- BSH r25 = r19, tnc
- ;; }
- {.mmi; nop 0
- nop 0
- FSH r26 = r19, cnt
-}{.mmi; ld8 r19 = [up], UPD
- nop 0
- BSH r27 = r16, tnc
- ;; }
- {.mmi; nop 0
- nop 0
- FSH r20 = r16, cnt
-}{.mmi; ld8 r16 = [up], UPD
- nop 0
- BSH r21 = r17, tnc
- ;; }
- {.mmi; nop 0
- or r14 = r25, r24
- FSH r22 = r17, cnt
-}{.mib; ld8 r17 = [up], UPD
- BSH r23 = r18, tnc
- br.cloop.dptk L(gt8)
- ;; }
- {.mmi; nop 0
- or r15 = r27, r26
- FSH r24 = r18, cnt
-}{.mib; sub r31 = -1, r14
- BSH r25 = r19, tnc
- br .Lr8 }
-
-L(gt8):
- or r15 = r27, r26
- FSH r24 = r18, cnt
- ld8 r18 = [up], UPD
- sub r31 = -1, r14
- BSH r25 = r19, tnc
- br .LL00
-
-.Lb01:
- br.cloop.dptk L(gt1)
- ;;
- BSH r8 = r10, tnc
- FSH r22 = r10, cnt
- ;;
- sub r31 = -1, r22
- br .Lr1
- ;;
-L(gt1):
- ld8 r18 = [up], UPD
- BSH r8 = r10, tnc
- FSH r22 = r10, cnt
- ;;
- ld8 r19 = [up], UPD
- ;;
- ld8 r16 = [up], UPD
- ;;
- ld8 r17 = [up], UPD
- BSH r23 = r18, tnc
- br.cloop.dptk L(gt5)
- ;;
- nop 0
- FSH r24 = r18, cnt
- BSH r25 = r19, tnc
- ;;
- nop 0
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
- ;;
- or r15 = r23, r22
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
- ;;
- or r14 = r25, r24
- FSH r22 = r17, cnt
- sub r31 = -1, r15
- br .Lr5
-
-L(gt5):
- {.mmi; nop 0
- nop 0
- FSH r24 = r18, cnt
-}{.mmi; ld8 r18 = [up], UPD
- nop 0
- BSH r25 = r19, tnc
- ;; }
- {.mmi; nop 0
- nop 0
- FSH r26 = r19, cnt
-}{.mmi; ld8 r19 = [up], UPD
- nop 0
- BSH r27 = r16, tnc
- ;; }
- {.mmi; nop 0
- or r15 = r23, r22
- FSH r20 = r16, cnt
-}{.mmi; ld8 r16 = [up], UPD
- nop 0
- BSH r21 = r17, tnc
- ;; }
- {.mmi; or r14 = r25, r24
- sub r31 = -1, r15
- FSH r22 = r17, cnt
-}{.mib; ld8 r17 = [up], UPD
- BSH r23 = r18, tnc
- br L(end)
- ;; }
-
-.Lb10:
- ld8 r17 = [up], UPD
- br.cloop.dptk L(gt2)
- ;;
- BSH r8 = r10, tnc
- FSH r20 = r10, cnt
- ;;
- BSH r21 = r17, tnc
- FSH r22 = r17, cnt
- ;;
- or r14 = r21, r20
- ;;
- sub r31 = -1, r14
- br .Lr2
- ;;
-L(gt2):
- ld8 r18 = [up], UPD
- BSH r8 = r10, tnc
- FSH r20 = r10, cnt
- ;;
- ld8 r19 = [up], UPD
- ;;
- ld8 r16 = [up], UPD
- BSH r21 = r17, tnc
- FSH r22 = r17, cnt
- ;;
- ld8 r17 = [up], UPD
- BSH r23 = r18, tnc
- br.cloop.dptk L(gt6)
- ;;
- nop 0
- FSH r24 = r18, cnt
- BSH r25 = r19, tnc
- ;;
- or r14 = r21, r20
- FSH r26 = r19, cnt
- BSH r27 = r16, tnc
- ;;
- {.mmi; nop 0
- or r15 = r23, r22
- FSH r20 = r16, cnt
-}{.mib; sub r31 = -1, r14
- BSH r21 = r17, tnc
- br .Lr6
- ;; }
-L(gt6):
- {.mmi; nop 0
- nop 0
- FSH r24 = r18, cnt
-}{.mmi; ld8 r18 = [up], UPD
- nop 0
- BSH r25 = r19, tnc
- ;; }
- {.mmi; nop 0
- or r14 = r21, r20
- FSH r26 = r19, cnt
-}{.mmi; ld8 r19 = [up], UPD
- nop 0
- BSH r27 = r16, tnc
- ;; }
- {.mmi; or r15 = r23, r22
- sub r31 = -1, r14
- FSH r20 = r16, cnt
-}{.mib; ld8 r16 = [up], UPD
- BSH r21 = r17, tnc
- br .LL10
-}
-
-.Lb11:
- ld8 r16 = [up], UPD
- ;;
- ld8 r17 = [up], UPD
- BSH r8 = r10, tnc
- FSH r26 = r10, cnt
- br.cloop.dptk L(gt3)
- ;;
- BSH r27 = r16, tnc
- ;;
- FSH r20 = r16, cnt
- BSH r21 = r17, tnc
- ;;
- FSH r22 = r17, cnt
- ;;
- or r15 = r27, r26
- ;;
- or r14 = r21, r20
- sub r31 = -1, r15
- br .Lr3
- ;;
-L(gt3):
- ld8 r18 = [up], UPD
- ;;
- ld8 r19 = [up], UPD
- BSH r27 = r16, tnc
- ;;
- {.mmi; nop 0
- nop 0
- FSH r20 = r16, cnt
-}{.mmi; ld8 r16 = [up], UPD
- nop 0
- BSH r21 = r17, tnc
- ;; }
- {.mmi nop 0
- nop 0
- FSH r22 = r17, cnt
-}{.mib; ld8 r17 = [up], UPD
- BSH r23 = r18, tnc
- br.cloop.dptk L(gt7)
- ;; }
- or r15 = r27, r26
- FSH r24 = r18, cnt
- BSH r25 = r19, tnc
- ;;
- {.mmi; nop 0
- or r14 = r21, r20
- FSH r26 = r19, cnt
-}{.mib; sub r31 = -1, r15
- BSH r27 = r16, tnc
- br .Lr7
-}
-L(gt7):
- {.mmi; nop 0
- or r15 = r27, r26
- FSH r24 = r18, cnt
-}{.mmi; ld8 r18 = [up], UPD
- nop 0
- BSH r25 = r19, tnc
- ;; }
- {.mmi; or r14 = r21, r20
- sub r31 = -1, r15
- FSH r26 = r19, cnt
-}{.mib; ld8 r19 = [up], UPD
- BSH r27 = r16, tnc
- br .LL11
-}
-
-C *** MAIN LOOP START ***
- ALIGN(32)
-L(top):
-.LL01:
- {.mmi; st8 [rp] = r31, UPD C M2
- or r15 = r27, r26 C M3
- FSH r24 = r18, cnt C I0
-}{.mmi; ld8 r18 = [up], UPD C M0
- sub r31 = -1, r14 C M1
- BSH r25 = r19, tnc C I1
- ;; }
-.LL00:
- {.mmi; st8 [rp] = r31, UPD
- or r14 = r21, r20
- FSH r26 = r19, cnt
-}{.mmi; ld8 r19 = [up], UPD
- sub r31 = -1, r15
- BSH r27 = r16, tnc
- ;; }
-.LL11:
- {.mmi; st8 [rp] = r31, UPD
- or r15 = r23, r22
- FSH r20 = r16, cnt
-}{.mmi; ld8 r16 = [up], UPD
- sub r31 = -1, r14
- BSH r21 = r17, tnc
- ;; }
-.LL10:
- {.mmi; st8 [rp] = r31, UPD
- or r14 = r25, r24
- FSH r22 = r17, cnt
-}{.mmi; ld8 r17 = [up], UPD
- sub r31 = -1, r15
- BSH r23 = r18, tnc
- ;; }
-L(end): lfetch [r11], PUPD
- br.cloop.dptk L(top)
-C *** MAIN LOOP END ***
-
- {.mmi; st8 [rp] = r31, UPD
- or r15 = r27, r26
- FSH r24 = r18, cnt
-}{.mib; sub r31 = -1, r14
- BSH r25 = r19, tnc
- nop 0
- ;; }
-.Lr8:
- {.mmi; st8 [rp] = r31, UPD
- or r14 = r21, r20
- FSH r26 = r19, cnt
-}{.mib; sub r31 = -1, r15
- BSH r27 = r16, tnc
- nop 0
- ;; }
-.Lr7:
- {.mmi; st8 [rp] = r31, UPD
- or r15 = r23, r22
- FSH r20 = r16, cnt
-}{.mib; sub r31 = -1, r14
- BSH r21 = r17, tnc
- nop 0
- ;; }
-.Lr6: st8 [rp] = r31, UPD
- or r14 = r25, r24
- FSH r22 = r17, cnt
- sub r31 = -1, r15
- ;;
-.Lr5: st8 [rp] = r31, UPD
- or r15 = r27, r26
- sub r31 = -1, r14
- ;;
-.Lr4: st8 [rp] = r31, UPD
- or r14 = r21, r20
- sub r31 = -1, r15
- ;;
-.Lr3: st8 [rp] = r31, UPD
- sub r31 = -1, r14
- ;;
-.Lr2: st8 [rp] = r31, UPD
- sub r31 = -1, r22
- ;;
-.Lr1: st8 [rp] = r31, UPD C M23
- mov ar.lc = r2 C I0
- br.ret.sptk.many b0 C B
-EPILOGUE(func)
-ASM_END()
diff --git a/gmp/mpn/ia64/mod_34lsub1.asm b/gmp/mpn/ia64/mod_34lsub1.asm
deleted file mode 100644
index edf3602c4c..0000000000
--- a/gmp/mpn/ia64/mod_34lsub1.asm
+++ /dev/null
@@ -1,236 +0,0 @@
-dnl IA-64 mpn_mod_34lsub1
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: ?
-C Itanium 2: 1
-
-
-C INPUT PARAMETERS
-define(`up', `r32')
-define(`n', `r33')
-
-C Some useful aliases for registers we use
-define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
-define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
-define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
-
-C This is a fairly simple-minded implementation. One could approach 0.67 c/l
-C with a more sophisticated implementation. If we're really crazy, we could
-C super-unroll, storing carries just in predicate registers, then copy them to
-C a general register, and population count them from there. That'd bring us
-C close to 3 insn/limb, for nearly 0.5 c/l.
-
-C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
-C We therefore use a plain while-style loop:
-C add n = -3, n
-C cmp.le p9, p0 = 3, n
-C (p9) br.cond .Loop
-C Alternatively, we could table n/3 for, say, n < 256, and predicate the
-C 16-cycle code.
-
-C The summing-up code at the end was written quickly, and could surely be
-C vastly improved.
-
-ASM_START()
-PROLOGUE(mpn_mod_34lsub1)
- .prologue
- .save ar.lc, r2
- .body
-ifdef(`HAVE_ABI_32',`
- addp4 up = 0, up C M I
- nop.m 0
- zxt4 n = n C I
- ;;
-')
-
-ifelse(0,1,`
- movl r14 = 0xAAAAAAAAAAAAAAAB
- ;;
- setf.sig f6 = r14
- setf.sig f7 = r33
- ;;
- xmpy.hu f6 = f6, f7
- ;;
- getf.sig r8 = f6
- ;;
- shr.u r8 = r8, 1 C Loop count
- ;;
- mov.i ar.lc = r8
-')
-
- ld8 u0 = [up], 8
- cmp.ne p9, p0 = 1, n
- (p9) br L(gt1)
- ;;
- shr.u r8 = u0, 48
- dep.z r27 = u0, 0, 48
- ;;
- add r8 = r8, r27
- br.ret.sptk.many b0
-
-
-L(gt1):
-.mmi; nop.m 0
- mov a0 = 0
- add n = -2, n
-.mmi; mov c0 = 0
- mov c1 = 0
- mov c2 = 0
- ;;
-.mmi; ld8 u1 = [up], 8
- mov a1 = 0
- cmp.ltu p6, p0 = r0, r0 C clear p6
-.mmb; cmp.gt p9, p0 = 3, n
- mov a2 = 0
- (p9) br.cond.dptk L(end)
- ;;
-
- ALIGN(32)
-L(top):
-.mmi; ld8 u2 = [up], 8
- (p6) add c0 = 1, c0
- cmp.ltu p7, p0 = a0, u0
-.mmb; sub a0 = a0, u0
- add n = -3, n
- nop.b 0
- ;;
-.mmi; ld8 u0 = [up], 8
- (p7) add c1 = 1, c1
- cmp.ltu p8, p0 = a1, u1
-.mmb; sub a1 = a1, u1
- cmp.le p9, p0 = 3, n
- nop.b 0
- ;;
-.mmi; ld8 u1 = [up], 8
- (p8) add c2 = 1, c2
- cmp.ltu p6, p0 = a2, u2
-.mmb; sub a2 = a2, u2
- nop.m 0
-dnl br.cloop.dptk L(top)
- (p9) br.cond.dptk L(top)
- ;;
-
-L(end):
- cmp.eq p10, p0 = 0, n
- cmp.eq p11, p0 = 1, n
- (p10) br L(0)
-
-L(2):
-.mmi; ld8 u2 = [up], 8
- (p6) add c0 = 1, c0
- cmp.ltu p7, p0 = a0, u0
-.mmb; sub a0 = a0, u0
- nop.m 0
- (p11) br L(1)
- ;;
- ld8 u0 = [up], 8
- (p7) add c1 = 1, c1
- cmp.ltu p8, p0 = a1, u1
- sub a1 = a1, u1
- ;;
- (p8) add c2 = 1, c2
- cmp.ltu p6, p0 = a2, u2
- sub a2 = a2, u2
- ;;
- (p6) add c0 = 1, c0
- cmp.ltu p7, p0 = a0, u0
- sub a0 = a0, u0
- ;;
- (p7) add c1 = 1, c1
- br L(com)
-
-
-L(1):
- (p7) add c1 = 1, c1
- cmp.ltu p8, p0 = a1, u1
- sub a1 = a1, u1
- ;;
- (p8) add c2 = 1, c2
- cmp.ltu p6, p0 = a2, u2
- sub a2 = a2, u2
- ;;
- (p6) add c0 = 1, c0
- br L(com)
-
-
-L(0):
- (p6) add c0 = 1, c0
- cmp.ltu p7, p0 = a0, u0
- sub a0 = a0, u0
- ;;
- (p7) add c1 = 1, c1
- cmp.ltu p8, p0 = a1, u1
- sub a1 = a1, u1
- ;;
- (p8) add c2 = 1, c2
-
-L(com):
-C | a2 | a1 | a0 |
-C | | | | |
- shr.u r24 = a0, 48 C 16 bits
- shr.u r25 = a1, 32 C 32 bits
- shr.u r26 = a2, 16 C 48 bits
- ;;
- shr.u r10 = c0, 48 C 16 bits, always zero
- shr.u r11 = c1, 32 C 32 bits
- shr.u r30 = c2, 16 C 48 bits
- ;;
- dep.z r27 = a0, 0, 48 C 48 bits
- dep.z r28 = a1, 16, 32 C 48 bits
- dep.z r29 = a2, 32, 16 C 48 bits
- dep.z r31 = c0, 0, 48 C 48 bits
- dep.z r14 = c1, 16, 32 C 48 bits
- dep.z r15 = c2, 32, 16 C 48 bits
- ;;
-.mmi; add r24 = r24, r25
- add r26 = r26, r27
- add r28 = r28, r29
-.mmi; add r10 = r10, r11
- add r30 = r30, r31
- add r14 = r14, r15
- ;;
- movl r8 = 0xffffffffffff0
- add r24 = r24, r26
- add r10 = r10, r30
- ;;
- add r24 = r24, r28
- add r10 = r10, r14
- ;;
- sub r8 = r8, r24
- ;;
- add r8 = r8, r10
- br.ret.sptk.many b0
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/ia64/mode1o.asm b/gmp/mpn/ia64/mode1o.asm
index 14d5e81602..6b3626ebe6 100644
--- a/gmp/mpn/ia64/mode1o.asm
+++ b/gmp/mpn/ia64/mode1o.asm
@@ -1,34 +1,21 @@
dnl Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
-dnl Contributed to the GNU project by Kevin Ryde.
-
-dnl Copyright 2003-2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/ia64/mul_1.asm b/gmp/mpn/ia64/mul_1.asm
index 21bf6d0e14..8df8d93f8e 100644
--- a/gmp/mpn/ia64/mul_1.asm
+++ b/gmp/mpn/ia64/mul_1.asm
@@ -1,35 +1,23 @@
dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl store the result in a second limb vector.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
+dnl Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -553,6 +541,7 @@ C *** MAIN LOOP END ***
(p6) cmp.leu p8, p9 = r24, r17
(p7) cmp.ltu p8, p9 = r24, r17
;;
+ .pred.rel "mutex",p8,p9
(p8) add r8 = 1, r8
mov.i ar.lc = r2
br.ret.sptk.many b0
diff --git a/gmp/mpn/ia64/mul_2.asm b/gmp/mpn/ia64/mul_2.asm
index 2bbce97267..b0d4ef70a1 100644
--- a/gmp/mpn/ia64/mul_2.asm
+++ b/gmp/mpn/ia64/mul_2.asm
@@ -1,47 +1,39 @@
dnl IA-64 mpn_mul_2 -- Multiply a n-limb number with a 2-limb number and store
dnl store the result to a (n+1)-limb number.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2004, 2011 Free Software Foundation, Inc.
+dnl Copyright 2004 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C Itanium: ?
-C Itanium 2: 1.5
+C Itanium: 3.15
+C Itanium 2: 1.625
+
+C Note that this is very similar to addmul_2.asm. If you change this file,
+C please change that file too.
C TODO
C * Clean up variable names, and try to decrease the number of distinct
C registers used.
-C * Clean up feed-in code to not require zeroing several registers.
+C * Cleanup feed-in code to not require zeroing several registers.
C * Make sure we don't depend on uninitialized predicate registers.
+C * We currently cross-jump very aggressively, at the expense of a few cycles
+C per operation. Consider changing that.
C * Could perhaps save a few cycles by using 1 c/l carry propagation in
C wind-down code.
C * Ultimately rewrite. The problem with this code is that it first uses a
@@ -102,519 +94,598 @@ PROLOGUE(mpn_mul_2)
.save ar.lc, r2
.body
-ifdef(`HAVE_ABI_32',`
-.mmi; addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- addp4 vp = 0, vp C M I
-.mmi; nop 1
- nop 1
- zxt4 n = n C I
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ zxt4 n = n C I
;;')
-.mmi; ldf8 ux = [up], 8 C M
- ldf8 v0 = [vp], 8 C M
- mov r2 = ar.lc C I0
-.mmi; nop 1 C M
- and r14 = 3, n C M I
- add n = -2, n C M I
- ;;
-.mmi; ldf8 uy = [up], 8 C M
- ldf8 v1 = [vp] C M
- shr.u n = n, 2 C I0
-.mmi; nop 1 C M
- cmp.eq p10, p0 = 1, r14 C M I
- cmp.eq p11, p0 = 2, r14 C M I
- ;;
-.mmi; nop 1 C M
- cmp.eq p12, p0 = 3, r14 C M I
- mov ar.lc = n C I0
-.bbb; (p10) br.dptk L(b01) C B
- (p11) br.dptk L(b10) C B
- (p12) br.dptk L(b11) C B
- ;;
+{.mmi C 00
+ ldf8 ux = [up], 8 C M
+ ldf8 v0 = [vp], 8 C M
+ mov.i r2 = ar.lc C I0
+}{.mmi
+ nop 0 C M
+ and r14 = 3, n C M I
+ add n = -2, n C M I
+ ;;
+}{.mmi C 01
+ ldf8 uy = [up], 8 C M
+ ldf8 v1 = [vp] C M
+ shr.u n = n, 2 C I
+}{.mmi
+ nop 0 C M
+ cmp.eq p10, p0 = 1, r14 C M I
+ cmp.eq p11, p0 = 2, r14 C M I
+ ;;
+}{.mmi C 02
+ nop 0 C M
+ cmp.eq p12, p0 = 3, r14 C M I
+ mov.i ar.lc = n C I0
+}{.bbb
+ (p10) br.dptk .Lb01 C B
+ (p11) br.dptk .Lb10 C B
+ (p12) br.dptk .Lb11 C B
+ ;;
+}
ALIGN(32)
-L(b00): ldf8 u_1 = [up], 8
- mov acc1_2 = 0
- mov pr1_2 = 0
- mov pr0_3 = 0
- cmp.ne p8, p9 = r0, r0
+.Lb00: ldf8 u_1 = [up], 8
+ mov acc1_2 = 0
+ mov pr1_2 = 0
+ mov pr0_3 = 0
+ cmp.ne p8, p9 = r0, r0
;;
- xma.l fp0b_3 = ux, v0, f0
- cmp.ne p12, p13 = r0, r0
- ldf8 u_2 = [up], 8
- xma.hu fp1a_3 = ux, v0, f0
- br.cloop.dptk L(gt4)
+ xma.l fp0b_3 = ux, v0, f0
+ cmp.ne p12, p13 = r0, r0
+ ldf8 u_2 = [up], 8
+ xma.hu fp1a_3 = ux, v0, f0
+ br.cloop.dptk .grt4
- xma.l fp0b_0 = uy, v0, f0
- xma.hu fp1a_0 = uy, v0, f0
+ xma.l fp0b_0 = uy, v0, f0
+ xma.hu fp1a_0 = uy, v0, f0
;;
- getfsig acc0 = fp0b_3
- xma.l fp1b_3 = ux, v1, fp1a_3
- xma.hu fp2a_3 = ux, v1, fp1a_3
+ getf.sig acc0 = fp0b_3
+ xma.l fp1b_3 = ux, v1, fp1a_3
+ xma.hu fp2a_3 = ux, v1, fp1a_3
;;
- xma.l fp0b_1 = u_1, v0, f0
- xma.hu fp1a_1 = u_1, v0, f0
+ xma.l fp0b_1 = u_1, v0, f0
+ xma.hu fp1a_1 = u_1, v0, f0
;;
- getfsig pr0_0 = fp0b_0
- xma.l fp1b_0 = uy, v1, fp1a_0
- xma.hu fp2a_0 = uy, v1, fp1a_0
+ getf.sig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
;;
- getfsig pr1_3 = fp1b_3
- getfsig acc1_3 = fp2a_3
- xma.l fp0b_2 = u_2, v0, f0
- xma.hu fp1a_2 = u_2, v0, f0
- br L(cj4)
+ getf.sig pr1_3 = fp1b_3
+ getf.sig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, f0
+ xma.hu fp1a_2 = u_2, v0, f0
+ br .Lcj4
-L(gt4): xma.l fp0b_0 = uy, v0, f0
- xma.hu fp1a_0 = uy, v0, f0
+.grt4: xma.l fp0b_0 = uy, v0, f0
+ xma.hu fp1a_0 = uy, v0, f0
;;
- getfsig acc0 = fp0b_3
- xma.l fp1b_3 = ux, v1, fp1a_3
- ldf8 u_3 = [up], 8
- xma.hu fp2a_3 = ux, v1, fp1a_3
+ getf.sig acc0 = fp0b_3
+ xma.l fp1b_3 = ux, v1, fp1a_3
+ ldf8 u_3 = [up], 8
+ xma.hu fp2a_3 = ux, v1, fp1a_3
;;
- xma.l fp0b_1 = u_1, v0, f0
- xma.hu fp1a_1 = u_1, v0, f0
+ xma.l fp0b_1 = u_1, v0, f0
+ xma.hu fp1a_1 = u_1, v0, f0
;;
- getfsig pr0_0 = fp0b_0
- xma.l fp1b_0 = uy, v1, fp1a_0
- xma.hu fp2a_0 = uy, v1, fp1a_0
+ getf.sig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
;;
- ldf8 u_0 = [up], 8
- getfsig pr1_3 = fp1b_3
- xma.l fp0b_2 = u_2, v0, f0
+ ldf8 u_0 = [up], 8
+ getf.sig pr1_3 = fp1b_3
;;
- getfsig acc1_3 = fp2a_3
- xma.hu fp1a_2 = u_2, v0, f0
- br L(00)
+ getf.sig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, f0
+ xma.hu fp1a_2 = u_2, v0, f0
+ br .LL00
ALIGN(32)
-L(b01): ldf8 u_0 = [up], 8 C M
- mov acc1_1 = 0 C M I
- mov pr1_1 = 0 C M I
- mov pr0_2 = 0 C M I
- cmp.ne p6, p7 = r0, r0 C M I
+.Lb01: ldf8 u_0 = [up], 8 C M
+ mov acc1_1 = 0 C M I
+ mov pr1_1 = 0 C M I
+ mov pr0_2 = 0 C M I
+ cmp.ne p6, p7 = r0, r0 C M I
;;
- xma.l fp0b_2 = ux, v0, f0 C F
- cmp.ne p10, p11 = r0, r0 C M I
- ldf8 u_1 = [up], 8 C M
- xma.hu fp1a_2 = ux, v0, f0 C F
+ xma.l fp0b_2 = ux, v0, f0 C F
+ cmp.ne p10, p11 = r0, r0 C M I
+ ldf8 u_1 = [up], 8 C M
+ xma.hu fp1a_2 = ux, v0, f0 C F
;;
- xma.l fp0b_3 = uy, v0, f0 C F
- xma.hu fp1a_3 = uy, v0, f0 C F
+ xma.l fp0b_3 = uy, v0, f0 C F
+ xma.hu fp1a_3 = uy, v0, f0 C F
;;
- getfsig acc0 = fp0b_2 C M
- xma.l fp1b_2 = ux, v1,fp1a_2 C F
- ldf8 u_2 = [up], 8 C M
- xma.hu fp2a_2 = ux, v1,fp1a_2 C F
- br.cloop.dptk L(gt5)
+ getf.sig acc0 = fp0b_2 C M
+ xma.l fp1b_2 = ux, v1,fp1a_2 C F
+ xma.hu fp2a_2 = ux, v1,fp1a_2 C F
+ ldf8 u_2 = [up], 8 C M
+ br.cloop.dptk .grt5
- xma.l fp0b_0 = u_0, v0, f0 C F
- xma.hu fp1a_0 = u_0, v0, f0 C F
+ xma.l fp0b_0 = u_0, v0, f0 C F
+ xma.hu fp1a_0 = u_0, v0, f0 C F
;;
- getfsig pr0_3 = fp0b_3 C M
- xma.l fp1b_3 = uy, v1,fp1a_3 C F
- xma.hu fp2a_3 = uy, v1,fp1a_3 C F
+ getf.sig pr0_3 = fp0b_3 C M
+ xma.l fp1b_3 = uy, v1,fp1a_3 C F
+ xma.hu fp2a_3 = uy, v1,fp1a_3 C F
;;
- getfsig pr1_2 = fp1b_2 C M
- getfsig acc1_2 = fp2a_2 C M
- xma.l fp0b_1 = u_1, v0, f0 C F
- xma.hu fp1a_1 = u_1, v0, f0 C F
- br L(cj5)
+ getf.sig pr1_2 = fp1b_2 C M
+ getf.sig acc1_2 = fp2a_2 C M
+ xma.l fp0b_1 = u_1, v0, f0 C F
+ xma.hu fp1a_1 = u_1, v0, f0 C F
+ br .Lcj5
-L(gt5): xma.l fp0b_0 = u_0, v0, f0
- xma.hu fp1a_0 = u_0, v0, f0
+.grt5: xma.l fp0b_0 = u_0, v0, f0
+ xma.hu fp1a_0 = u_0, v0, f0
;;
- getfsig pr0_3 = fp0b_3
- xma.l fp1b_3 = uy, v1, fp1a_3
- xma.hu fp2a_3 = uy, v1, fp1a_3
+ getf.sig pr0_3 = fp0b_3
+ xma.l fp1b_3 = uy, v1, fp1a_3
+ xma.hu fp2a_3 = uy, v1, fp1a_3
;;
- ldf8 u_3 = [up], 8
- getfsig pr1_2 = fp1b_2
- xma.l fp0b_1 = u_1, v0, f0
+ ldf8 u_3 = [up], 8
+ getf.sig pr1_2 = fp1b_2
;;
- getfsig acc1_2 = fp2a_2
- xma.hu fp1a_1 = u_1, v0, f0
- br L(01)
+ getf.sig acc1_2 = fp2a_2
+ xma.l fp0b_1 = u_1, v0, f0
+ xma.hu fp1a_1 = u_1, v0, f0
+ br .LL01
+C We have two variants for n = 2. They turn out to run at exactly the same
+C speed. But the first, odd variant might allow one cycle to be trimmed.
ALIGN(32)
-L(b10): br.cloop.dptk L(gt2)
- xma.l fp0b_1 = ux, v0, f0
- xma.hu fp1a_1 = ux, v0, f0
- ;;
- xma.l fp0b_2 = uy, v0, f0
- xma.hu fp1a_2 = uy, v0, f0
- ;;
- stf8 [rp] = fp0b_1, 8
- xma.l fp1b_1 = ux, v1, fp1a_1
- xma.hu fp2a_1 = ux, v1, fp1a_1
- ;;
- getfsig acc0 = fp0b_2
- xma.l fp1b_2 = uy, v1, fp1a_2
- xma.hu fp2a_2 = uy, v1, fp1a_2
- ;;
- getfsig pr1_1 = fp1b_1
- getfsig acc1_1 = fp2a_1
- mov ar.lc = r2
- getfsig pr1_2 = fp1b_2
- getfsig r8 = fp2a_2
- ;;
- add s0 = pr1_1, acc0
- ;;
- st8 [rp] = s0, 8
- cmp.ltu p8, p9 = s0, pr1_1
- sub r31 = -1, acc1_1
- ;;
- .pred.rel "mutex", p8, p9
- (p8) add acc0 = pr1_2, acc1_1, 1
- (p9) add acc0 = pr1_2, acc1_1
- (p8) cmp.leu p10, p0 = r31, pr1_2
- (p9) cmp.ltu p10, p0 = r31, pr1_2
- ;;
- st8 [rp] = acc0, 8
- (p10) add r8 = 1, r8
- br.ret.sptk.many b0
-
-L(gt2): ldf8 u_3 = [up], 8
- mov acc1_0 = 0
- mov pr1_0 = 0
- ;;
- mov pr0_1 = 0
- xma.l fp0b_1 = ux, v0, f0
- ldf8 u_0 = [up], 8
- xma.hu fp1a_1 = ux, v0, f0
- ;;
- xma.l fp0b_2 = uy, v0, f0
- xma.hu fp1a_2 = uy, v0, f0
- ;;
- getfsig acc0 = fp0b_1
- xma.l fp1b_1 = ux, v1, fp1a_1
- xma.hu fp2a_1 = ux, v1, fp1a_1
- ;;
- ldf8 u_1 = [up], 8
- xma.l fp0b_3 = u_3, v0, f0
- xma.hu fp1a_3 = u_3, v0, f0
- ;;
- getfsig pr0_2 = fp0b_2
- xma.l fp1b_2 = uy, v1, fp1a_2
- xma.hu fp2a_2 = uy, v1, fp1a_2
- ;;
- ldf8 u_2 = [up], 8
- getfsig pr1_1 = fp1b_1
- ;;
-.mfi; getfsig acc1_1 = fp2a_1
- xma.l fp0b_0 = u_0, v0, f0
- cmp.ne p8, p9 = r0, r0
-.mfb; cmp.ne p12, p13 = r0, r0
- xma.hu fp1a_0 = u_0, v0, f0
- br L(10)
+ifdef(`',`
+.Lb10: C 03
+ br.cloop.dptk .grt2
+ C 04
+ C 05
+ C 06
+ xma.l fp0b_1 = ux, v0, f0 C 0
+ xma.hu fp1a_1 = ux, v0, f0 C 1
+ ;; C 07
+ xma.l fp0b_2 = uy, v0, f0 C 1
+ xma.l fp1b_1 = ux, v1, f0 C 1
+ ;; C 08
+ xma.hu fp1a_2 = uy, v0, f0 C 2
+ xma.hu fp2a_1 = ux, v1, f0 C 2
+ ;; C 09
+ xma.l fp1b_2 = uy, v1, f0 C 2
+ xma.hu fp2a_2 = uy, v1, f0 C 3
+ ;; C 10
+ getf.sig r16 = fp1a_1
+ stf8 [rp] = fp0b_1, 8
+ ;; C 11
+ getf.sig r17 = fp0b_2
+ C 12
+ getf.sig r18 = fp1b_1
+ C 13
+ getf.sig r19 = fp1a_2
+ C 14
+ getf.sig r20 = fp2a_1
+ C 15
+ getf.sig r21 = fp1b_2
+ ;; C 16
+ getf.sig r8 = fp2a_2
+ add r24 = r16, r17
+ ;; C 17
+ cmp.ltu p6, p7 = r24, r16
+ add r26 = r24, r18
+ ;; C 18
+ cmp.ltu p8, p9 = r26, r24
+ ;; C 19
+ st8 [rp] = r26, 8
+ (p6) add r25 = r19, r20, 1
+ (p7) add r25 = r19, r20
+ ;; C 20
+ (p8) add r27 = r25, r21, 1
+ (p9) add r27 = r25, r21
+ (p6) cmp.leu p10, p0 = r25, r19
+ (p7) cmp.ltu p10, p0 = r25, r19
+ ;; C 21
+ (p10) add r8 = 1, r8
+ (p8) cmp.leu p12, p0 = r27, r25
+ (p9) cmp.ltu p12, p0 = r27, r25
+ ;; C 22
+ st8 [rp] = r27, 8
+ mov.i ar.lc = r2
+ (p12) add r8 = 1, r8
+ br.ret.sptk.many b0
+')
+
+.Lb10: C 03
+ br.cloop.dptk .grt2
+ C 04
+ C 05
+ C 06
+ xma.l fp0b_1 = ux, v0, f0
+ xma.hu fp1a_1 = ux, v0, f0
+ ;; C 07
+ xma.l fp0b_2 = uy, v0, f0
+ xma.hu fp1a_2 = uy, v0, f0
+ ;; C 08
+ C 09
+ C 10
+ stf8 [rp] = fp0b_1, 8
+ xma.l fp1b_1 = ux, v1, fp1a_1
+ xma.hu fp2a_1 = ux, v1, fp1a_1
+ ;; C 11
+ getf.sig acc0 = fp0b_2
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;; C 12
+ C 13
+ C 14
+ getf.sig pr1_1 = fp1b_1
+ C 15
+ getf.sig acc1_1 = fp2a_1
+ C 16
+ getf.sig pr1_2 = fp1b_2
+ C 17
+ getf.sig r8 = fp2a_2
+ ;; C 18
+ C 19
+ add s0 = pr1_1, acc0
+ ;; C 20
+ st8 [rp] = s0, 8
+ cmp.ltu p8, p9 = s0, pr1_1
+ sub r31 = -1, acc1_1
+ ;; C 21
+ .pred.rel "mutex", p8, p9
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ (p8) cmp.leu p10, p0 = r31, pr1_2
+ (p9) cmp.ltu p10, p0 = r31, pr1_2
+ ;; C 22
+ st8 [rp] = acc0, 8
+ mov.i ar.lc = r2
+ (p10) add r8 = 1, r8
+ br.ret.sptk.many b0
+
+
+.grt2: ldf8 u_3 = [up], 8
+ mov acc1_0 = 0
+ mov pr1_0 = 0
+ ;;
+ mov pr0_1 = 0
+ xma.l fp0b_1 = ux, v0, f0
+ ldf8 u_0 = [up], 8
+ xma.hu fp1a_1 = ux, v0, f0
+ ;;
+ xma.l fp0b_2 = uy, v0, f0
+ xma.hu fp1a_2 = uy, v0, f0
+ ;;
+ getf.sig acc0 = fp0b_1
+ xma.l fp1b_1 = ux, v1, fp1a_1
+ xma.hu fp2a_1 = ux, v1, fp1a_1
+ ;;
+ ldf8 u_1 = [up], 8
+ xma.l fp0b_3 = u_3, v0, f0
+ xma.hu fp1a_3 = u_3, v0, f0
+ ;;
+ getf.sig pr0_2 = fp0b_2
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;;
+ ldf8 u_2 = [up], 8
+ getf.sig pr1_1 = fp1b_1
+ ;;
+ getf.sig acc1_1 = fp2a_1
+ xma.l fp0b_0 = u_0, v0, f0
+ cmp.ne p8, p9 = r0, r0
+ cmp.ne p12, p13 = r0, r0
+ xma.hu fp1a_0 = u_0, v0, f0
+ br .LL10
ALIGN(32)
-L(b11): mov acc1_3 = 0
- mov pr1_3 = 0
- mov pr0_0 = 0
- ldf8 u_2 = [up], 8
- cmp.ne p6, p7 = r0, r0
- br.cloop.dptk L(gt3)
+.Lb11: mov acc1_3 = 0
+ mov pr1_3 = 0
+ mov pr0_0 = 0
+ cmp.ne p6, p7 = r0, r0
+ ;;
+ ldf8 u_2 = [up], 8
+ br.cloop.dptk .grt3
;;
- xma.l fp0b_0 = ux, v0, f0
- xma.hu fp1a_0 = ux, v0, f0
+ xma.l fp0b_0 = ux, v0, f0
+ xma.hu fp1a_0 = ux, v0, f0
;;
- cmp.ne p10, p11 = r0, r0
- xma.l fp0b_1 = uy, v0, f0
- xma.hu fp1a_1 = uy, v0, f0
+ cmp.ne p10, p11 = r0, r0
+ xma.l fp0b_1 = uy, v0, f0
+ xma.hu fp1a_1 = uy, v0, f0
;;
- getfsig acc0 = fp0b_0
- xma.l fp1b_0 = ux, v1, fp1a_0
- xma.hu fp2a_0 = ux, v1, fp1a_0
+ getf.sig acc0 = fp0b_0
+ xma.l fp1b_0 = ux, v1, fp1a_0
+ xma.hu fp2a_0 = ux, v1, fp1a_0
;;
- xma.l fp0b_2 = u_2, v0, f0
- xma.hu fp1a_2 = u_2, v0, f0
+ xma.l fp0b_2 = u_2, v0, f0
+ xma.hu fp1a_2 = u_2, v0, f0
;;
- getfsig pr0_1 = fp0b_1
- xma.l fp1b_1 = uy, v1, fp1a_1
- xma.hu fp2a_1 = uy, v1, fp1a_1
+ getf.sig pr0_1 = fp0b_1
+ xma.l fp1b_1 = uy, v1, fp1a_1
+ xma.hu fp2a_1 = uy, v1, fp1a_1
;;
- getfsig pr1_0 = fp1b_0
- getfsig acc1_0 = fp2a_0
- br L(cj3)
+ getf.sig pr1_0 = fp1b_0
+ getf.sig acc1_0 = fp2a_0
+ br .Lcj3
-L(gt3): xma.l fp0b_0 = ux, v0, f0
- cmp.ne p10, p11 = r0, r0
- ldf8 u_3 = [up], 8
- xma.hu fp1a_0 = ux, v0, f0
+.grt3: xma.l fp0b_0 = ux, v0, f0
+ cmp.ne p10, p11 = r0, r0
+ ldf8 u_3 = [up], 8
+ xma.hu fp1a_0 = ux, v0, f0
;;
- xma.l fp0b_1 = uy, v0, f0
- xma.hu fp1a_1 = uy, v0, f0
+ xma.l fp0b_1 = uy, v0, f0
+ xma.hu fp1a_1 = uy, v0, f0
;;
- getfsig acc0 = fp0b_0
- xma.l fp1b_0 = ux, v1, fp1a_0
- ldf8 u_0 = [up], 8
- xma.hu fp2a_0 = ux, v1, fp1a_0
+ getf.sig acc0 = fp0b_0
+ xma.l fp1b_0 = ux, v1, fp1a_0
+ ldf8 u_0 = [up], 8
+ xma.hu fp2a_0 = ux, v1, fp1a_0
;;
- xma.l fp0b_2 = u_2, v0, f0
- xma.hu fp1a_2 = u_2, v0, f0
+ xma.l fp0b_2 = u_2, v0, f0
+ xma.hu fp1a_2 = u_2, v0, f0
;;
- getfsig pr0_1 = fp0b_1
- xma.l fp1b_1 = uy, v1, fp1a_1
- xma.hu fp2a_1 = uy, v1, fp1a_1
+ getf.sig pr0_1 = fp0b_1
+ xma.l fp1b_1 = uy, v1, fp1a_1
+ xma.hu fp2a_1 = uy, v1, fp1a_1
;;
- ldf8 u_1 = [up], 8
- getfsig pr1_0 = fp1b_0
+ ldf8 u_1 = [up], 8
+ getf.sig pr1_0 = fp1b_0
;;
- getfsig acc1_0 = fp2a_0
- xma.l fp0b_3 = u_3, v0, f0
- xma.hu fp1a_3 = u_3, v0, f0
- br L(11)
+ getf.sig acc1_0 = fp2a_0
+ xma.l fp0b_3 = u_3, v0, f0
+ xma.hu fp1a_3 = u_3, v0, f0
+ br .LL11
C *** MAIN LOOP START ***
ALIGN(32)
-L(top): C 00
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
- ldf8 u_3 = [up], 8
- getfsig pr1_2 = fp1b_2
- (p8) cmp.leu p6, p7 = acc0, pr0_1
- (p9) cmp.ltu p6, p7 = acc0, pr0_1
- (p12) cmp.leu p10, p11 = s0, pr1_0
- (p13) cmp.ltu p10, p11 = s0, pr1_0
+.Loop: C 00
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_3 = fp0b_3
+ xma.l fp1b_3 = u_3, v1, fp1a_3
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ xma.hu fp2a_3 = u_3, v1, fp1a_3
;; C 01
- .pred.rel "mutex", p6, p7
- getfsig acc1_2 = fp2a_2
- st8 [rp] = s0, 8
- xma.l fp0b_1 = u_1, v0, f0
- (p6) add acc0 = pr0_2, acc1_0, 1
- (p7) add acc0 = pr0_2, acc1_0
- xma.hu fp1a_1 = u_1, v0, f0
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_3 = [up], 8
+ getf.sig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
;; C 02
-L(01):
- .pred.rel "mutex", p10, p11
- getfsig pr0_0 = fp0b_0
- xma.l fp1b_0 = u_0, v1, fp1a_0
- (p10) add s0 = pr1_1, acc0, 1
- (p11) add s0 = pr1_1, acc0
- xma.hu fp2a_0 = u_0, v1, fp1a_0
- nop 1
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ xma.l fp0b_1 = u_1, v0, f0
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, f0
;; C 03
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
- ldf8 u_0 = [up], 8
- getfsig pr1_3 = fp1b_3
- (p6) cmp.leu p8, p9 = acc0, pr0_2
- (p7) cmp.ltu p8, p9 = acc0, pr0_2
- (p10) cmp.leu p12, p13 = s0, pr1_1
- (p11) cmp.ltu p12, p13 = s0, pr1_1
+.LL01:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_0 = fp0b_0
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
;; C 04
- .pred.rel "mutex", p8, p9
- getfsig acc1_3 = fp2a_3
- st8 [rp] = s0, 8
- xma.l fp0b_2 = u_2, v0, f0
- (p8) add acc0 = pr0_3, acc1_1, 1
- (p9) add acc0 = pr0_3, acc1_1
- xma.hu fp1a_2 = u_2, v0, f0
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_0 = [up], 8
+ getf.sig pr1_3 = fp1b_3
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
;; C 05
-L(00):
- .pred.rel "mutex", p12, p13
- getfsig pr0_1 = fp0b_1
- xma.l fp1b_1 = u_1, v1, fp1a_1
- (p12) add s0 = pr1_2, acc0, 1
- (p13) add s0 = pr1_2, acc0
- xma.hu fp2a_1 = u_1, v1, fp1a_1
- nop 1
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_3 = fp2a_3
+ st8 [rp] = s0, 8
+ xma.l fp0b_2 = u_2, v0, f0
+ (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, f0
;; C 06
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
- ldf8 u_1 = [up], 8
- getfsig pr1_0 = fp1b_0
- (p8) cmp.leu p6, p7 = acc0, pr0_3
- (p9) cmp.ltu p6, p7 = acc0, pr0_3
- (p12) cmp.leu p10, p11 = s0, pr1_2
- (p13) cmp.ltu p10, p11 = s0, pr1_2
+.LL00:
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_1 = fp0b_1
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+ (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
;; C 07
- .pred.rel "mutex", p6, p7
- getfsig acc1_0 = fp2a_0
- st8 [rp] = s0, 8
- xma.l fp0b_3 = u_3, v0, f0
- (p6) add acc0 = pr0_0, acc1_2, 1
- (p7) add acc0 = pr0_0, acc1_2
- xma.hu fp1a_3 = u_3, v0, f0
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_1 = [up], 8
+ getf.sig pr1_0 = fp1b_0
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+ (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
;; C 08
-L(11):
- .pred.rel "mutex", p10, p11
- getfsig pr0_2 = fp0b_2
- xma.l fp1b_2 = u_2, v1, fp1a_2
- (p10) add s0 = pr1_3, acc0, 1
- (p11) add s0 = pr1_3, acc0
- xma.hu fp2a_2 = u_2, v1, fp1a_2
- nop 1
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_0 = fp2a_0
+ st8 [rp] = s0, 8
+ xma.l fp0b_3 = u_3, v0, f0
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ xma.hu fp1a_3 = u_3, v0, f0
;; C 09
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
- ldf8 u_2 = [up], 8
- getfsig pr1_1 = fp1b_1
- (p6) cmp.leu p8, p9 = acc0, pr0_0
- (p7) cmp.ltu p8, p9 = acc0, pr0_0
- (p10) cmp.leu p12, p13 = s0, pr1_3
- (p11) cmp.ltu p12, p13 = s0, pr1_3
+.LL11:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_2 = fp0b_2
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+ (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
;; C 10
- .pred.rel "mutex", p8, p9
- getfsig acc1_1 = fp2a_1
- st8 [rp] = s0, 8
- xma.l fp0b_0 = u_0, v0, f0
- (p8) add acc0 = pr0_1, acc1_3, 1
- (p9) add acc0 = pr0_1, acc1_3
- xma.hu fp1a_0 = u_0, v0, f0
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_2 = [up], 8
+ getf.sig pr1_1 = fp1b_1
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+ (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
;; C 11
-L(10):
- .pred.rel "mutex", p12, p13
- getfsig pr0_3 = fp0b_3
- xma.l fp1b_3 = u_3, v1, fp1a_3
- (p12) add s0 = pr1_0, acc0, 1
- (p13) add s0 = pr1_0, acc0
- xma.hu fp2a_3 = u_3, v1, fp1a_3
- br.cloop.dptk L(top)
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_1 = fp2a_1
+ st8 [rp] = s0, 8
+ xma.l fp0b_0 = u_0, v0, f0
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ xma.hu fp1a_0 = u_0, v0, f0
+.LL10: br.cloop.dptk .Loop C 12
;;
C *** MAIN LOOP END ***
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
-.mmi; getfsig pr1_2 = fp1b_2
- st8 [rp] = s0, 8
- (p8) cmp.leu p6, p7 = acc0, pr0_1
-.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
- (p12) cmp.leu p10, p11 = s0, pr1_0
- (p13) cmp.ltu p10, p11 = s0, pr1_0
- ;;
- .pred.rel "mutex", p6, p7
-.mfi; getfsig acc1_2 = fp2a_2
- xma.l fp0b_1 = u_1, v0, f0
- nop 1
-.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
- (p7) add acc0 = pr0_2, acc1_0
- xma.hu fp1a_1 = u_1, v0, f0
- ;;
-L(cj5):
- .pred.rel "mutex", p10, p11
-.mfi; getfsig pr0_0 = fp0b_0
- xma.l fp1b_0 = u_0, v1, fp1a_0
- (p10) add s0 = pr1_1, acc0, 1
-.mfi; (p11) add s0 = pr1_1, acc0
- xma.hu fp2a_0 = u_0, v1, fp1a_0
- nop 1
- ;;
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
-.mmi; getfsig pr1_3 = fp1b_3
- st8 [rp] = s0, 8
- (p6) cmp.leu p8, p9 = acc0, pr0_2
-.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
- (p10) cmp.leu p12, p13 = s0, pr1_1
- (p11) cmp.ltu p12, p13 = s0, pr1_1
- ;;
- .pred.rel "mutex", p8, p9
-.mfi; getfsig acc1_3 = fp2a_3
- xma.l fp0b_2 = u_2, v0, f0
- nop 1
-.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
- (p9) add acc0 = pr0_3, acc1_1
- xma.hu fp1a_2 = u_2, v0, f0
- ;;
-L(cj4):
- .pred.rel "mutex", p12, p13
-.mfi; getfsig pr0_1 = fp0b_1
- xma.l fp1b_1 = u_1, v1, fp1a_1
- (p12) add s0 = pr1_2, acc0, 1
-.mfi; (p13) add s0 = pr1_2, acc0
- xma.hu fp2a_1 = u_1, v1, fp1a_1
- nop 1
- ;;
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
-.mmi; getfsig pr1_0 = fp1b_0
- st8 [rp] = s0, 8
- (p8) cmp.leu p6, p7 = acc0, pr0_3
-.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
- (p12) cmp.leu p10, p11 = s0, pr1_2
- (p13) cmp.ltu p10, p11 = s0, pr1_2
- ;;
- .pred.rel "mutex", p6, p7
-.mmi; getfsig acc1_0 = fp2a_0
- (p6) add acc0 = pr0_0, acc1_2, 1
- (p7) add acc0 = pr0_0, acc1_2
- ;;
-L(cj3):
- .pred.rel "mutex", p10, p11
-.mfi; getfsig pr0_2 = fp0b_2
- xma.l fp1b_2 = u_2, v1, fp1a_2
- (p10) add s0 = pr1_3, acc0, 1
-.mfi; (p11) add s0 = pr1_3, acc0
- xma.hu fp2a_2 = u_2, v1, fp1a_2
- nop 1
- ;;
- .pred.rel "mutex", p6, p7
- .pred.rel "mutex", p10, p11
-.mmi; getfsig pr1_1 = fp1b_1
- st8 [rp] = s0, 8
- (p6) cmp.leu p8, p9 = acc0, pr0_0
-.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
- (p10) cmp.leu p12, p13 = s0, pr1_3
- (p11) cmp.ltu p12, p13 = s0, pr1_3
- ;;
- .pred.rel "mutex", p8, p9
-.mmi; getfsig acc1_1 = fp2a_1
- (p8) add acc0 = pr0_1, acc1_3, 1
- (p9) add acc0 = pr0_1, acc1_3
- ;;
- .pred.rel "mutex", p12, p13
-.mmi; (p12) add s0 = pr1_0, acc0, 1
- (p13) add s0 = pr1_0, acc0
- nop 1
- ;;
- .pred.rel "mutex", p8, p9
- .pred.rel "mutex", p12, p13
-.mmi; getfsig pr1_2 = fp1b_2
- st8 [rp] = s0, 8
- (p8) cmp.leu p6, p7 = acc0, pr0_1
-.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
- (p12) cmp.leu p10, p11 = s0, pr1_0
- (p13) cmp.ltu p10, p11 = s0, pr1_0
- ;;
- .pred.rel "mutex", p6, p7
-.mmi; getfsig r8 = fp2a_2
- (p6) add acc0 = pr0_2, acc1_0, 1
- (p7) add acc0 = pr0_2, acc1_0
- ;;
- .pred.rel "mutex", p10, p11
-.mmi; (p10) add s0 = pr1_1, acc0, 1
- (p11) add s0 = pr1_1, acc0
- (p6) cmp.leu p8, p9 = acc0, pr0_2
- ;;
- .pred.rel "mutex", p10, p11
-.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
- (p10) cmp.leu p12, p13 = s0, pr1_1
- (p11) cmp.ltu p12, p13 = s0, pr1_1
- ;;
- .pred.rel "mutex", p8, p9
-.mmi; st8 [rp] = s0, 8
- (p8) add acc0 = pr1_2, acc1_1, 1
- (p9) add acc0 = pr1_2, acc1_1
- ;;
- .pred.rel "mutex", p8, p9
-.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
- (p9) cmp.ltu p10, p11 = acc0, pr1_2
- (p12) add acc0 = 1, acc0
- ;;
-.mmi; st8 [rp] = acc0, 8
- (p12) cmpeqor p10, p0 = 0, acc0
- nop 1
- ;;
-.mib; (p10) add r8 = 1, r8
- mov ar.lc = r2
- br.ret.sptk.many b0
+.Lcj6:
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_3 = fp0b_3
+ xma.l fp1b_3 = u_3, v1, fp1a_3
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ xma.hu fp2a_3 = u_3, v1, fp1a_3
+ ;;
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ getf.sig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ xma.l fp0b_1 = u_1, v0, f0
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, f0
+ ;;
+.Lcj5:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_0 = fp0b_0
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
+ ;;
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ getf.sig pr1_3 = fp1b_3
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_3 = fp2a_3
+ st8 [rp] = s0, 8
+ xma.l fp0b_2 = u_2, v0, f0
+ (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, f0
+ ;;
+.Lcj4:
+ .pred.rel "mutex", p12, p13
+ getf.sig pr0_1 = fp0b_1
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+ (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ getf.sig pr1_0 = fp1b_0
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+ (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
+ ;;
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_0 = fp2a_0
+ st8 [rp] = s0, 8
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ ;;
+.Lcj3:
+ .pred.rel "mutex", p10, p11
+ getf.sig pr0_2 = fp0b_2
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+ (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
+ ;;
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ getf.sig pr1_1 = fp1b_1
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+ (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
+ ;;
+ .pred.rel "mutex", p8, p9
+ getf.sig acc1_1 = fp2a_1
+ st8 [rp] = s0, 8
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ ;;
+ .pred.rel "mutex", p12, p13
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ ;;
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ getf.sig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+ .pred.rel "mutex", p6, p7
+ getf.sig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ ;;
+ .pred.rel "mutex", p10, p11
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ ;;
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ st8 [rp] = s0, 8
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ (p8) cmp.leu p10, p11 = acc0, pr1_2
+ (p9) cmp.ltu p10, p11 = acc0, pr1_2
+ (p12) add acc0 = 1, acc0
+ ;;
+ st8 [rp] = acc0, 8
+ (p12) cmp.eq.or p10, p0 = 0, acc0
+ mov r8 = acc1_2
+ ;;
+ .pred.rel "mutex", p10, p11
+ (p10) add r8 = 1, r8
+ mov.i ar.lc = r2
+ br.ret.sptk.many b0
EPILOGUE()
ASM_END()
diff --git a/gmp/mpn/ia64/popcount.asm b/gmp/mpn/ia64/popcount.asm
index c0b5c5c1cf..a02bf4346c 100644
--- a/gmp/mpn/ia64/popcount.asm
+++ b/gmp/mpn/ia64/popcount.asm
@@ -1,34 +1,22 @@
dnl IA-64 mpn_popcount -- mpn population count.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2000-2005 Free Software Foundation, Inc.
+dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -50,7 +38,6 @@ PROLOGUE(mpn_popcount)
.prologue
ifdef(`HAVE_ABI_32',
` addp4 up = 0, up C M I
- nop.m 0
zxt4 n = n C I
;;
')
diff --git a/gmp/mpn/ia64/rsh1aors_n.asm b/gmp/mpn/ia64/rsh1aors_n.asm
index 3c7defb0ba..366b5c50bb 100644
--- a/gmp/mpn/ia64/rsh1aors_n.asm
+++ b/gmp/mpn/ia64/rsh1aors_n.asm
@@ -1,34 +1,21 @@
dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2003-2005 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -78,8 +65,6 @@ ifdef(`HAVE_ABI_32',`
addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
addp4 vp = 0, vp C M I
- nop.m 0
- nop.m 0
zxt4 n = n C I
;;
')
diff --git a/gmp/mpn/ia64/sec_tabselect.asm b/gmp/mpn/ia64/sec_tabselect.asm
deleted file mode 100644
index f116ea3843..0000000000
--- a/gmp/mpn/ia64/sec_tabselect.asm
+++ /dev/null
@@ -1,150 +0,0 @@
-dnl IA-64 mpn_sec_tabselect.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: ?
-C Itanium 2: 2.5
-
-C NOTES
-C * Using software pipelining could trivially yield 2 c/l without unrolling,
-C or 1+epsilon with unrolling. (This code was modelled after the powerpc64
-C code, for simplicity.)
-
-C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
-define(`rp', `r32')
-define(`tp', `r33')
-define(`n', `r34')
-define(`nents', `r35')
-define(`which', `r36')
-
-define(`mask', `r8')
-
-define(`rp1', `r32')
-define(`tp1', `r33')
-define(`rp2', `r14')
-define(`tp2', `r15')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sec_tabselect)
- .prologue
- .save ar.lc, r2
- .body
-ifdef(`HAVE_ABI_32',`
-.mmi; addp4 rp = 0, rp C M I
- addp4 tp = 0, tp C M I
- zxt4 n = n C I
-.mii; nop 0
- zxt4 nents = nents C I
- zxt4 which = which C I
- ;;
-')
-.mmi; add rp2 = 8, rp1
- add tp2 = 8, tp1
- add r6 = -2, n
- ;;
-.mmi; cmp.eq p10, p0 = 1, n
- and r9 = 1, n C set cr0 for use in inner loop
- shr.u r6 = r6, 1 C inner loop count
- ;;
-.mmi; cmp.eq p8, p0 = 0, r9
- sub which = nents, which
- shl n = n, 3
- ;;
-
-L(outer):
-.mmi cmp.eq p6, p7 = which, nents C are we at the selected table entry?
- nop 0
- mov ar.lc = r6 C I0
- ;;
-.mmb;
- (p6) mov mask = -1
- (p7) mov mask = 0
- (p8) br.dptk L(top) C branch to loop entry if n even
- ;;
-
-.mmi; ld8 r16 = [tp1], 8
- add tp2 = 8, tp2
- nop 0
- ;;
-.mmi; ld8 r18 = [rp1]
- and r16 = r16, mask
- nop 0
- ;;
-.mmi; andcm r18 = r18, mask
- ;;
- or r16 = r16, r18
- nop 0
- ;;
-.mmb; st8 [rp1] = r16, 8
- add rp2 = 8, rp2
- (p10) br.dpnt L(end)
-
- ALIGN(32)
-L(top):
-.mmi; ld8 r16 = [tp1], 16
- ld8 r17 = [tp2], 16
- nop 0
- ;;
-.mmi; ld8 r18 = [rp1]
- and r16 = r16, mask
- nop 0
-.mmi; ld8 r19 = [rp2]
- and r17 = r17, mask
- nop 0
- ;;
-.mmi; andcm r18 = r18, mask
- andcm r19 = r19, mask
- nop 0
- ;;
-.mmi; or r16 = r16, r18
- or r17 = r17, r19
- nop 0
- ;;
-.mmb; st8 [rp1] = r16, 16
- st8 [rp2] = r17, 16
- br.cloop.dptk L(top)
- ;;
-L(end):
-.mmi; sub rp1 = rp1, n C move rp back to beginning
- sub rp2 = rp2, n C move rp back to beginning
- cmp.ne p9, p0 = 1, nents
-.mmb; add nents = -1, nents
- nop 0
- (p9) br.dptk L(outer)
- ;;
-
-.mib; nop 0
- nop 0
- br.ret.sptk.many b0
-EPILOGUE()
diff --git a/gmp/mpn/ia64/sqr_diag_addlsh1.asm b/gmp/mpn/ia64/sqr_diag_addlsh1.asm
deleted file mode 100644
index f9288298b3..0000000000
--- a/gmp/mpn/ia64/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl IA-64 mpn_sqr_diag_addlsh1
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C Itanium: ?
-C Itanium 2: 2 Unrolling could bring it to 1.5 + epsilon
-
-C Exact performance table. The 2nd line is this code, the 3rd line is ctop-
-C less code. In an assembly sqr_basecase, the ctop-full numbers will become a
-C few cycles better since we can mitigate the many I0 instructions.
-C
-C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
-C - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
-C - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43
-
-C We should keep in mind that this code takes linear time in a O(n^2) context
-C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
-C around 60. Keeping overhead down for smallish operands (< 10) is more
-C important than optimal cycle counts.
-
-C TODO
-C * Make sure we don't depend on uninitialised r-registers, f-registers, or
-C * p-registers.
-C * Optimise by doing first two loop iterations in function header.
-
-C INPUT PARAMETERS
-define(`rp_param', `r32') define(`rp', `r14') C size: 2n
-define(`tp_param', `r33') define(`tp', `r15') C size: 2n - 2
-define(`up_param', `r34') define(`up', `r31') C size: n
-define(`n', `r35')
-
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
-
- .prologue
- .save ar.pfs, r2
- .save ar.lc, r3
- .body
-
-.mmi; alloc r2 = ar.pfs, 4,24,0,24 C M
- nop 4711
- mov r3 = ar.lc C I0
-.mmi; mov tp = tp_param C M I
- mov up = up_param C M I
- mov rp = rp_param C M I
- ;;
-.mmi; ld8 r36 = [tp], 8 C M
- add r20 = -2, n C M I
- mov r9 = ar.ec C I0
- ;;
-.mmi; ld8 r32 = [tp], 8 C M
- mov r16 = 0 C M I
- mov ar.ec = 7 C I0
- ;;
-.mmi; nop 4711
- mov r44 = 0 C M I
- mov ar.lc = r20 C I0
- ;;
-.mii; mov r33 = 0
- mov r10 = pr C I0
- mov pr.rot = 0x30000 C I0
- ;;
- br.cexit.spnt.few.clr L(end)
-
-dnl *** MAIN LOOP START ***
- ALIGN(32)
-L(top):
-.mfi; (p18) ldf8 f33 = [up], 8 C M
- (p20) xma.l f36 = f35, f35, f42 C F
- (p41) cmpequc p50, p0 = -1, r44 C M I
-.mfi; setfsig f40 = r16 C M23
- (p20) xma.hu f38 = f35, f35, f42 C F
- (p23) add r50 = r41, r49 C M I
- ;;
-.mmi; (p16) ld8 r36 = [tp], 8 C M
- (p23) cmpltu p40, p0 = r50, r41 C cyout hi M I
- (p19) shrp r45 = r38, r35, 63 C non-critical I0
-.mmi; (p21) getfsig r39 = f39 C hi M2
- (p24) st8 [rp] = r51, 8 C hi M23
- (p41) add r44 = 1, r44 C M I
- ;;
-.mmi; (p16) ld8 r32 = [tp], 8 C M
- (p50) cmpeqor p40, p0 = -1, r50 C cyout hi M I
- (p17) shrp r16 = r33, r37, 63 C critical I0
-.mmi; (p21) getfsig r42 = f37 C lo M2
- (p23) st8 [rp] = r44, 8 C lo M23
- (p50) add r50 = 1, r50 C M I
- ;;
- br.ctop.sptk.few.clr L(top) C B
-dnl *** MAIN LOOP END ***
- ;;
-L(end):
-.mmi; nop 4711
- (p41) add r44 = 1, r44 C M I
- shr.u r48 = r39, 63 C I0
- ;;
-.mmi; st8 [rp] = r51, 8 C M23
- (p41) cmpequc p6, p0 = 0, r44 C M I
- add r50 = r41, r48 C M I
- ;;
-.mmi; st8 [rp] = r44, 8 C M23
- (p6) add r50 = 1, r50 C M I
- mov ar.lc = r3 C I0
- ;;
-.mii; st8 [rp] = r50 C M23
- mov ar.ec = r9 C I0
- mov pr = r10 C I0
- ;;
-.mib; nop 4711
- mov ar.pfs = r2 C I0
- br.ret.sptk.many b0 C B
-EPILOGUE()
diff --git a/gmp/mpn/ia64/sqr_diagonal.asm b/gmp/mpn/ia64/sqr_diagonal.asm
new file mode 100644
index 0000000000..50307d4bb5
--- /dev/null
+++ b/gmp/mpn/ia64/sqr_diagonal.asm
@@ -0,0 +1,79 @@
+dnl IA-64 mpn_sqr_diagonal. Helper for sqr_basecase.
+
+dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 4
+C Itanium 2: 2
+
+C TODO
+C * Perhaps avoid ctop loop. Unfortunately, a cloop loop running at 1 c/l
+C would need prohibitive 8-way unrolling.
+C * Instead of messing too much with this, write a nifty mpn_sqr_basecase.
+
+C INPUT PARAMETERS (the stacked registers are used directly, no m4 aliases)
+C rp = r32	destination: low limbs stored via r32, high limbs via r18 = rp+8
+C sp = r33	source limb vector, read with ldf8
+C n = r34	count; ar.lc is loaded with n-1
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ .prologue
+ .save ar.lc, r2
+ .save pr, r15
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 r32 = 0, r32
+ addp4 r33 = 0, r33
+ zxt4 r34 = r34
+ ;;
+')
+ ldf8 f32 = [r33], 8 C M load sp[0] early
+ mov r2 = ar.lc C I0 save ar.lc, restored before return
+ mov r14 = ar.ec C I0 save ar.ec, restored at .Ldone
+ mov r15 = pr C I0 save predicates, restored before return
+ add r19 = -1, r34 C M I r19 = n-1, becomes ar.lc
+ add r18 = 8, r32 C M I r18 = rp+8, pointer for high limbs
+ ;;
+ mov ar.lc = r19 C I0 loop count = n-1
+ mov ar.ec = 5 C I0 epilogue count 5
+ mov pr.rot = 1<<16 C I0 only bit 16 (p16) set in the rotating predicates
+ ;;
+ br.cexit.spnt .Ldone C B
+ ;;
+ ALIGN(32)
+.Loop:
+ (p16) ldf8 f32 = [r33], 8 C M load next source limb
+ (p19) xma.l f36 = f35, f35, f0 C F low 64 bits of f35*f35
+ (p21) stf8 [r32] = f38, 16 C M2 M3 store low limb, advance 16 bytes
+ (p19) xma.hu f40 = f35, f35, f0 C F high 64 bits of f35*f35
+ (p21) stf8 [r18] = f42, 16 C M2 M3 store high limb, advance 16 bytes
+ br.ctop.dptk .Loop C B
+ ;;
+.Ldone:
+ stf8 [r32] = f38 C M2 M3 low-limb store after loop exit
+ stf8 [r18] = f42 C M2 M3 high-limb store after loop exit
+ mov ar.ec = r14 C I0 restore ar.ec
+ ;;
+ mov pr = r15, 0x1ffff C I0 restore predicates under mask 0x1ffff
+ mov ar.lc = r2 C I0 restore ar.lc
+ br.ret.sptk.many b0 C B
+EPILOGUE(mpn_sqr_diagonal)
+ASM_END()
diff --git a/gmp/mpn/ia64/submul_1.asm b/gmp/mpn/ia64/submul_1.asm
index cb2a5525b5..ae46e55d75 100644
--- a/gmp/mpn/ia64/submul_1.asm
+++ b/gmp/mpn/ia64/submul_1.asm
@@ -1,35 +1,22 @@
dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
dnl result from a second limb vector.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2000-2004 Free Software Foundation, Inc.
+dnl Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/lisp/gmpasm-mode.el b/gmp/mpn/lisp/gmpasm-mode.el
index 06b74bd6ce..31a9b48cbe 100644
--- a/gmp/mpn/lisp/gmpasm-mode.el
+++ b/gmp/mpn/lisp/gmpasm-mode.el
@@ -1,33 +1,22 @@
;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode.
-;; Copyright 1999-2002 Free Software Foundation, Inc.
-
-;; This file is part of the GNU MP Library.
-;;
-;; The GNU MP Library is free software; you can redistribute it and/or modify
-;; it under the terms of either:
-;;
-;; * the GNU Lesser General Public License as published by the Free
-;; Software Foundation; either version 3 of the License, or (at your
-;; option) any later version.
-;;
-;; or
-;;
-;; * the GNU General Public License as published by the Free Software
-;; Foundation; either version 2 of the License, or (at your option) any
-;; later version.
-;;
-;; or both in parallel, as here.
-;;
-;; The GNU MP Library is distributed in the hope that it will be useful, but
-;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-;; for more details.
-;;
-;; You should have received copies of the GNU General Public License and the
-;; GNU Lesser General Public License along with the GNU MP Library. If not,
-;; see https://www.gnu.org/licenses/.
+;; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
+;;
+;; This file is part of the GNU MP Library.
+;;
+;; The GNU MP Library is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU Lesser General Public License as published by
+;; the Free Software Foundation; either version 3 of the License, or (at your
+;; option) any later version.
+;;
+;; The GNU MP Library is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU Lesser General Public License
+;; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
;;; Commentary:
diff --git a/gmp/mpn/m4-ccas b/gmp/mpn/m4-ccas
index 16d80c6f51..984e8e9b0e 100755
--- a/gmp/mpn/m4-ccas
+++ b/gmp/mpn/m4-ccas
@@ -4,31 +4,20 @@
# Copyright 2001 Free Software Foundation, Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# Usage: m4-ccas --m4=M4 CC ... file.asm ...
diff --git a/gmp/mpn/m68k/README b/gmp/mpn/m68k/README
index 5261564df2..8838f8d41f 100644
--- a/gmp/mpn/m68k/README
+++ b/gmp/mpn/m68k/README
@@ -3,28 +3,17 @@ Copyright 2001, 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/m68k/aors_n.asm b/gmp/mpn/m68k/aors_n.asm
index f7d379ec01..da9bb415b2 100644
--- a/gmp/mpn/m68k/aors_n.asm
+++ b/gmp/mpn/m68k/aors_n.asm
@@ -1,32 +1,22 @@
dnl mc68020 mpn_add_n, mpn_sub_n -- add or subtract limb vectors
-dnl Copyright 1992, 1994, 1996, 1999-2003, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002, 2003, 2005 Free
+dnl Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/m68k/gmp-mparam.h b/gmp/mpn/m68k/gmp-mparam.h
index 9ac7b41019..c623046535 100644
--- a/gmp/mpn/m68k/gmp-mparam.h
+++ b/gmp/mpn/m68k/gmp-mparam.h
@@ -1,47 +1,36 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2000-2004 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* 25MHz 68040 */
/* Generated by tuneup.c, 2004-02-05, gcc 3.2 */
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 90
+#define MUL_KARATSUBA_THRESHOLD 14
+#define MUL_TOOM3_THRESHOLD 90
#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 28
+#define SQR_KARATSUBA_THRESHOLD 28
#define SQR_TOOM3_THRESHOLD 98
#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
diff --git a/gmp/mpn/m68k/lshift.asm b/gmp/mpn/m68k/lshift.asm
index f202abfe43..9d7a5ed0f6 100644
--- a/gmp/mpn/m68k/lshift.asm
+++ b/gmp/mpn/m68k/lshift.asm
@@ -1,32 +1,22 @@
dnl mc68020 mpn_lshift -- mpn left shift.
-dnl Copyright 1996, 1999-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1996, 1999, 2000, 2001, 2002, 2003 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -114,14 +104,14 @@ ifelse(scale_available_p,1,`
bcs L(L1)
subql #1, s_size
-L(Loop):
+L(Loop:)
movel M(-,s_ptr), d2
movel d2, d3
lsrl d5, d3
orl d3, d1
movel d1, M(-,res_ptr)
lsll cnt, d2
-L(L1):
+L(L1:)
movel M(-,s_ptr), d1
movel d1, d3
lsrl d5, d3
@@ -133,7 +123,7 @@ L(L1):
subl #0x10000, s_size
bcc L(Loop)
-L(Lend):
+L(Lend:)
movel d1, M(-,res_ptr) C store least significant limb
C Restore used registers from stack frame.
diff --git a/gmp/mpn/m68k/m68k-defs.m4 b/gmp/mpn/m68k/m68k-defs.m4
index 15289f676f..17a345998a 100644
--- a/gmp/mpn/m68k/m68k-defs.m4
+++ b/gmp/mpn/m68k/m68k-defs.m4
@@ -2,33 +2,22 @@ divert(-1)
dnl m4 macros for 68k assembler.
-dnl Copyright 2001-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl The default m4 `#' commenting interferes with the assembler syntax for
diff --git a/gmp/mpn/m68k/mc68020/aorsmul_1.asm b/gmp/mpn/m68k/mc68020/aorsmul_1.asm
index 4ee30ad9b3..17866602f8 100644
--- a/gmp/mpn/m68k/mc68020/aorsmul_1.asm
+++ b/gmp/mpn/m68k/mc68020/aorsmul_1.asm
@@ -1,32 +1,22 @@
dnl mc68020 mpn_addmul_1, mpn_submul_1 -- add or subtract mpn multiple.
-dnl Copyright 1992, 1994, 1996, 1999-2002, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -55,7 +45,6 @@ define(s1_ptr, `a1')
define(s1_size, `d2')
define(s2_limb, `d4')
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
PROLOGUE(M4_function_1)
diff --git a/gmp/mpn/m68k/mc68020/mul_1.asm b/gmp/mpn/m68k/mc68020/mul_1.asm
index f5fbb3063b..d24f6d1d9a 100644
--- a/gmp/mpn/m68k/mc68020/mul_1.asm
+++ b/gmp/mpn/m68k/mc68020/mul_1.asm
@@ -1,32 +1,22 @@
dnl mc68020 mpn_mul_1 -- mpn by limb multiply
-dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/m68k/mc68020/udiv.asm b/gmp/mpn/m68k/mc68020/udiv.asm
index aadeab999a..ebc1ef26db 100644
--- a/gmp/mpn/m68k/mc68020/udiv.asm
+++ b/gmp/mpn/m68k/mc68020/udiv.asm
@@ -1,32 +1,21 @@
dnl mc68020 mpn_udiv_qrnnd -- 2x1 limb division
-dnl Copyright 1999-2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/m68k/mc68020/umul.asm b/gmp/mpn/m68k/mc68020/umul.asm
index f19314e9bb..4d6e8a8eb8 100644
--- a/gmp/mpn/m68k/mc68020/umul.asm
+++ b/gmp/mpn/m68k/mc68020/umul.asm
@@ -1,32 +1,21 @@
dnl mc68020 mpn_umul_ppmm -- limb by limb multiplication
-dnl Copyright 1999-2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/m68k/rshift.asm b/gmp/mpn/m68k/rshift.asm
index 21b5f89f48..1bf58ac310 100644
--- a/gmp/mpn/m68k/rshift.asm
+++ b/gmp/mpn/m68k/rshift.asm
@@ -1,32 +1,22 @@
dnl mc68020 mpn_rshift -- mpn right shift.
-dnl Copyright 1996, 1999-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1996, 1999, 2000, 2001, 2002, 2003 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -89,7 +79,7 @@ ifelse(scale_available_p,1,`
cmpl s_ptr, a2
bls L(Lspecial) C jump if s_ptr >= res_ptr + s_size
-L(Lnormal):
+L(Lnormal:)
moveql #32, d5
subl cnt, d5
movel M(s_ptr,+), d2
@@ -104,14 +94,14 @@ L(Lnormal):
bcs L(L1)
subql #1, s_size
-L(Loop):
+L(Loop:)
movel M(s_ptr,+), d2
movel d2, d3
lsll d5, d3
orl d3, d1
movel d1, M(res_ptr,+)
lsrl cnt, d2
-L(L1):
+L(L1:)
movel M(s_ptr,+), d1
movel d1, d3
lsll d5, d3
@@ -123,7 +113,7 @@ L(L1):
subl #0x10000, s_size
bcc L(Loop)
-L(Lend):
+L(Lend:)
movel d1, M(res_ptr) C store most significant limb
C Restore used registers from stack frame.
@@ -134,7 +124,7 @@ C We loop from most significant end of the arrays, which is only permissible
C if the source and destination don't overlap, since the function is
C documented to work for overlapping source and destination.
-L(Lspecial):
+L(Lspecial:)
ifelse(scale_available_p,1,`
lea M(s_ptr,s_size,l,4), s_ptr
lea M(res_ptr,s_size,l,4), res_ptr
@@ -151,11 +141,11 @@ ifelse(scale_available_p,1,`
bcc L(LL1)
subql #1, s_size
-L(LLoop):
+L(LLoop:)
movel M(-,s_ptr), d2
roxrl #1, d2
movel d2, M(-,res_ptr)
-L(LL1):
+L(LL1:)
movel M(-,s_ptr), d2
roxrl #1, d2
movel d2, M(-,res_ptr)
@@ -167,7 +157,7 @@ L(LL1):
addl d0, d0 C restore cy
bra L(LLoop)
-L(LLend):
+L(LLend:)
C Restore used registers from stack frame.
moveml M(sp,+), d2-d6/a2
rts
diff --git a/gmp/mpn/m68k/t-m68k-defs.pl b/gmp/mpn/m68k/t-m68k-defs.pl
index 91c21fa1f8..226afc5449 100644
--- a/gmp/mpn/m68k/t-m68k-defs.pl
+++ b/gmp/mpn/m68k/t-m68k-defs.pl
@@ -2,31 +2,20 @@
# Copyright 2001, 2003 Free Software Foundation, Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# Usage: perl t-m68k-defs.pl [-t]
diff --git a/gmp/mpn/m88k/README b/gmp/mpn/m88k/README
index 1b51e83079..046e3bf19a 100644
--- a/gmp/mpn/m88k/README
+++ b/gmp/mpn/m88k/README
@@ -3,28 +3,17 @@ Copyright 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/m88k/add_n.s b/gmp/mpn/m88k/add_n.s
index dbdb22f888..db2fffff3e 100644
--- a/gmp/mpn/m88k/add_n.s
+++ b/gmp/mpn/m88k/add_n.s
@@ -3,31 +3,20 @@
; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
-; This file is part of the GNU MP Library.
-;
-; The GNU MP Library is free software; you can redistribute it and/or modify
-; it under the terms of either:
-;
-; * the GNU Lesser General Public License as published by the Free
-; Software Foundation; either version 3 of the License, or (at your
-; option) any later version.
-;
-; or
-;
-; * the GNU General Public License as published by the Free Software
-; Foundation; either version 2 of the License, or (at your option) any
-; later version.
-;
-; or both in parallel, as here.
-;
-; The GNU MP Library is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-; for more details.
-;
-; You should have received copies of the GNU General Public License and the
-; GNU Lesser General Public License along with the GNU MP Library. If not,
-; see https://www.gnu.org/licenses/.
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; INPUT PARAMETERS
diff --git a/gmp/mpn/m88k/mc88110/add_n.S b/gmp/mpn/m88k/mc88110/add_n.S
index c3b12b3cd0..3b627c01a7 100644
--- a/gmp/mpn/m88k/mc88110/add_n.S
+++ b/gmp/mpn/m88k/mc88110/add_n.S
@@ -3,31 +3,20 @@
; Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
-; This file is part of the GNU MP Library.
-;
-; The GNU MP Library is free software; you can redistribute it and/or modify
-; it under the terms of either:
-;
-; * the GNU Lesser General Public License as published by the Free
-; Software Foundation; either version 3 of the License, or (at your
-; option) any later version.
-;
-; or
-;
-; * the GNU General Public License as published by the Free Software
-; Foundation; either version 2 of the License, or (at your option) any
-; later version.
-;
-; or both in parallel, as here.
-;
-; The GNU MP Library is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-; for more details.
-;
-; You should have received copies of the GNU General Public License and the
-; GNU Lesser General Public License along with the GNU MP Library. If not,
-; see https://www.gnu.org/licenses/.
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; INPUT PARAMETERS
diff --git a/gmp/mpn/m88k/mc88110/addmul_1.s b/gmp/mpn/m88k/mc88110/addmul_1.s
index 321221f23c..f41283395d 100644
--- a/gmp/mpn/m88k/mc88110/addmul_1.s
+++ b/gmp/mpn/m88k/mc88110/addmul_1.s
@@ -3,31 +3,20 @@
; Copyright 1996, 2000 Free Software Foundation, Inc.
-; This file is part of the GNU MP Library.
-;
-; The GNU MP Library is free software; you can redistribute it and/or modify
-; it under the terms of either:
-;
-; * the GNU Lesser General Public License as published by the Free
-; Software Foundation; either version 3 of the License, or (at your
-; option) any later version.
-;
-; or
-;
-; * the GNU General Public License as published by the Free Software
-; Foundation; either version 2 of the License, or (at your option) any
-; later version.
-;
-; or both in parallel, as here.
-;
-; The GNU MP Library is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-; for more details.
-;
-; You should have received copies of the GNU General Public License and the
-; GNU Lesser General Public License along with the GNU MP Library. If not,
-; see https://www.gnu.org/licenses/.
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; INPUT PARAMETERS
diff --git a/gmp/mpn/m88k/mc88110/mul_1.s b/gmp/mpn/m88k/mc88110/mul_1.s
index 28fd14b77b..e8e88790a7 100644
--- a/gmp/mpn/m88k/mc88110/mul_1.s
+++ b/gmp/mpn/m88k/mc88110/mul_1.s
@@ -3,31 +3,20 @@
; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
-; This file is part of the GNU MP Library.
-;
-; The GNU MP Library is free software; you can redistribute it and/or modify
-; it under the terms of either:
-;
-; * the GNU Lesser General Public License as published by the Free
-; Software Foundation; either version 3 of the License, or (at your
-; option) any later version.
-;
-; or
-;
-; * the GNU General Public License as published by the Free Software
-; Foundation; either version 2 of the License, or (at your option) any
-; later version.
-;
-; or both in parallel, as here.
-;
-; The GNU MP Library is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-; for more details.
-;
-; You should have received copies of the GNU General Public License and the
-; GNU Lesser General Public License along with the GNU MP Library. If not,
-; see https://www.gnu.org/licenses/.
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; INPUT PARAMETERS
diff --git a/gmp/mpn/m88k/mc88110/sub_n.S b/gmp/mpn/m88k/mc88110/sub_n.S
index f0a8ecb3f0..a21a2cc0c0 100644
--- a/gmp/mpn/m88k/mc88110/sub_n.S
+++ b/gmp/mpn/m88k/mc88110/sub_n.S
@@ -3,31 +3,20 @@
; Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
-; This file is part of the GNU MP Library.
-;
-; The GNU MP Library is free software; you can redistribute it and/or modify
-; it under the terms of either:
-;
-; * the GNU Lesser General Public License as published by the Free
-; Software Foundation; either version 3 of the License, or (at your
-; option) any later version.
-;
-; or
-;
-; * the GNU General Public License as published by the Free Software
-; Foundation; either version 2 of the License, or (at your option) any
-; later version.
-;
-; or both in parallel, as here.
-;
-; The GNU MP Library is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-; for more details.
-;
-; You should have received copies of the GNU General Public License and the
-; GNU Lesser General Public License along with the GNU MP Library. If not,
-; see https://www.gnu.org/licenses/.
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; INPUT PARAMETERS
diff --git a/gmp/mpn/m88k/mul_1.s b/gmp/mpn/m88k/mul_1.s
index c8abdc0b7f..5c385bd351 100644
--- a/gmp/mpn/m88k/mul_1.s
+++ b/gmp/mpn/m88k/mul_1.s
@@ -3,31 +3,20 @@
; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
-; This file is part of the GNU MP Library.
-;
-; The GNU MP Library is free software; you can redistribute it and/or modify
-; it under the terms of either:
-;
-; * the GNU Lesser General Public License as published by the Free
-; Software Foundation; either version 3 of the License, or (at your
-; option) any later version.
-;
-; or
-;
-; * the GNU General Public License as published by the Free Software
-; Foundation; either version 2 of the License, or (at your option) any
-; later version.
-;
-; or both in parallel, as here.
-;
-; The GNU MP Library is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-; for more details.
-;
-; You should have received copies of the GNU General Public License and the
-; GNU Lesser General Public License along with the GNU MP Library. If not,
-; see https://www.gnu.org/licenses/.
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; INPUT PARAMETERS
diff --git a/gmp/mpn/m88k/sub_n.s b/gmp/mpn/m88k/sub_n.s
index 2bd8f09ca3..9ea78ff3a1 100644
--- a/gmp/mpn/m88k/sub_n.s
+++ b/gmp/mpn/m88k/sub_n.s
@@ -3,31 +3,20 @@
; Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
-; This file is part of the GNU MP Library.
-;
-; The GNU MP Library is free software; you can redistribute it and/or modify
-; it under the terms of either:
-;
-; * the GNU Lesser General Public License as published by the Free
-; Software Foundation; either version 3 of the License, or (at your
-; option) any later version.
-;
-; or
-;
-; * the GNU General Public License as published by the Free Software
-; Foundation; either version 2 of the License, or (at your option) any
-; later version.
-;
-; or both in parallel, as here.
-;
-; The GNU MP Library is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-; for more details.
-;
-; You should have received copies of the GNU General Public License and the
-; GNU Lesser General Public License along with the GNU MP Library. If not,
-; see https://www.gnu.org/licenses/.
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 3 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
; INPUT PARAMETERS
diff --git a/gmp/mpn/minithres/gmp-mparam.h b/gmp/mpn/minithres/gmp-mparam.h
index 1b8f311516..cf5970b7d1 100644
--- a/gmp/mpn/minithres/gmp-mparam.h
+++ b/gmp/mpn/minithres/gmp-mparam.h
@@ -1,109 +1,64 @@
/* Minimal values gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000, 2006, 2008-2010, 2012 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2006, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* The values in this file are not currently minimal.
Trimming them further would be good. */
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 3
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 4
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 1
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 3
-
-#define MUL_TOOM22_THRESHOLD 8
-#define MUL_TOOM33_THRESHOLD 20
-#define MUL_TOOM44_THRESHOLD 24
-#define MUL_TOOM6H_THRESHOLD 70 /* FIXME */
-#define MUL_TOOM8H_THRESHOLD 86
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 50 /* FIXME */
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 50 /* FIXME */
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 50 /* FIXME */
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 50 /* FIXME */
-
-#define SQR_BASECASE_THRESHOLD 0
-#define SQR_TOOM2_THRESHOLD 8
-#define SQR_TOOM3_THRESHOLD 20
-#define SQR_TOOM4_THRESHOLD 24
-#define SQR_TOOM6H_THRESHOLD 70 /* FIXME */
-#define SQR_TOOM8H_THRESHOLD 86
-
-#define MULMOD_BNM1_THRESHOLD 10
-#define SQRMOD_BNM1_THRESHOLD 10
-
-#define MUL_FFT_TABLE {64, 256, 1024, 4096, 8192, 65536, 0}
+#define MUL_KARATSUBA_THRESHOLD 8
+#define MUL_TOOM3_THRESHOLD 20
+#define MUL_TOOM44_THRESHOLD 24
+
+#define SQR_BASECASE_THRESHOLD 0
+#define SQR_KARATSUBA_THRESHOLD 8
+#define SQR_TOOM3_THRESHOLD 20
+#define SQR_TOOM4_THRESHOLD 24
+
+#define MULLOW_BASECASE_THRESHOLD 0
+#define MULLOW_DC_THRESHOLD 2
+#define MULLOW_MUL_N_THRESHOLD 4
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 6
+#define POWM_THRESHOLD 4
+
+#define HGCD_THRESHOLD 10
+#define GCD_DC_THRESHOLD 20
+#define GCDEXT_SCHOENHAGE_THRESHOLD 20
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1
+#define DIVREM_2_THRESHOLD 0 /* always */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 4
+#define GET_STR_PRECOMPUTE_THRESHOLD 10
+#define SET_STR_THRESHOLD 64
+#define SET_STR_PRECOMPUTE_THRESHOLD 100
+
+#define MUL_FFT_TABLE {64-1, 256-1, 1024-1, 4096-1, 8192-1, 65536-1, 0}
#define MUL_FFT_MODF_THRESHOLD 65
#define MUL_FFT_THRESHOLD 200
-#define SQR_FFT_TABLE {64, 256, 1024, 4096, 8192, 65536, 0}
+#define SQR_FFT_TABLE {64-1, 256-1, 1024-1, 4096-1, 8192-1, 65536-1, 0}
#define SQR_FFT_MODF_THRESHOLD 65
#define SQR_FFT_THRESHOLD 200
-
-#define MULLO_BASECASE_THRESHOLD 0
-#define MULLO_DC_THRESHOLD 2
-#define MULLO_MUL_N_THRESHOLD 4
-
-#define DC_DIV_QR_THRESHOLD 6
-#define DC_DIVAPPR_Q_THRESHOLD 6
-#define DC_BDIV_QR_THRESHOLD 4
-#define DC_BDIV_Q_THRESHOLD 4
-
-#define INV_MULMOD_BNM1_THRESHOLD 2
-#define INV_NEWTON_THRESHOLD 6
-#define INV_APPR_THRESHOLD 4
-
-#define BINV_NEWTON_THRESHOLD 6
-#define REDC_1_TO_REDC_N_THRESHOLD 9
-
-#define MU_DIV_QR_THRESHOLD 8
-#define MU_DIVAPPR_Q_THRESHOLD 8
-#define MUPI_DIV_QR_THRESHOLD 8
-#define MU_BDIV_QR_THRESHOLD 8
-#define MU_BDIV_Q_THRESHOLD 8
-
-#define MATRIX22_STRASSEN_THRESHOLD 2
-#define HGCD_THRESHOLD 10
-#define GCD_DC_THRESHOLD 20
-#define GCDEXT_SCHOENHAGE_THRESHOLD 20
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 4
-#define GET_STR_PRECOMPUTE_THRESHOLD 10
-#define SET_STR_THRESHOLD 64
-#define SET_STR_PRECOMPUTE_THRESHOLD 100
-
-#define FAC_ODD_THRESHOLD 0 /* always */
-#define FAC_DSC_THRESHOLD 70
diff --git a/gmp/mpn/mips32/add_n.asm b/gmp/mpn/mips32/add_n.asm
index e7d4c48f48..f7dc7efab9 100644
--- a/gmp/mpn/mips32/add_n.asm
+++ b/gmp/mpn/mips32/add_n.asm
@@ -4,30 +4,19 @@ dnl sum in a third limb vector.
dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips32/addmul_1.asm b/gmp/mpn/mips32/addmul_1.asm
index 9aa9e163ce..f43e3c638b 100644
--- a/gmp/mpn/mips32/addmul_1.asm
+++ b/gmp/mpn/mips32/addmul_1.asm
@@ -4,30 +4,19 @@ dnl the product to a second limb vector.
dnl Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips32/gmp-mparam.h b/gmp/mpn/mips32/gmp-mparam.h
index 986135df96..d86fd3f019 100644
--- a/gmp/mpn/mips32/gmp-mparam.h
+++ b/gmp/mpn/mips32/gmp-mparam.h
@@ -1,45 +1,35 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License along
+with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* Generated by tuneup.c, 2002-02-20, gcc 2.95 (R3000) */
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 50
+#define MUL_KARATSUBA_THRESHOLD 20
+#define MUL_TOOM3_THRESHOLD 50
#define SQR_BASECASE_THRESHOLD 7
-#define SQR_TOOM2_THRESHOLD 57
+#define SQR_KARATSUBA_THRESHOLD 57
#define SQR_TOOM3_THRESHOLD 78
#define DIV_SB_PREINV_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/mips32/lshift.asm b/gmp/mpn/mips32/lshift.asm
index 6a58bb4579..8a27951775 100644
--- a/gmp/mpn/mips32/lshift.asm
+++ b/gmp/mpn/mips32/lshift.asm
@@ -3,30 +3,19 @@ dnl MIPS32 mpn_lshift -- Left shift.
dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips32/mips-defs.m4 b/gmp/mpn/mips32/mips-defs.m4
index 5fa89eca35..a30e8df090 100644
--- a/gmp/mpn/mips32/mips-defs.m4
+++ b/gmp/mpn/mips32/mips-defs.m4
@@ -3,33 +3,22 @@ divert(-1)
dnl m4 macros for MIPS assembly code (both 32-bit and 64-bit).
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Usage: ASM_START()
diff --git a/gmp/mpn/mips32/mips.m4 b/gmp/mpn/mips32/mips.m4
index 8b49e575e4..37c6ca8f72 100644
--- a/gmp/mpn/mips32/mips.m4
+++ b/gmp/mpn/mips32/mips.m4
@@ -3,33 +3,22 @@ divert(-1)
dnl m4 macros for MIPS assembly code.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Usage: ASM_START()
diff --git a/gmp/mpn/mips32/mul_1.asm b/gmp/mpn/mips32/mul_1.asm
index 4337bc2bd4..1e1a275f66 100644
--- a/gmp/mpn/mips32/mul_1.asm
+++ b/gmp/mpn/mips32/mul_1.asm
@@ -4,30 +4,19 @@ dnl the product in a second limb vector.
dnl Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips32/rshift.asm b/gmp/mpn/mips32/rshift.asm
index 4b54510408..23d1e780e6 100644
--- a/gmp/mpn/mips32/rshift.asm
+++ b/gmp/mpn/mips32/rshift.asm
@@ -3,30 +3,19 @@ dnl MIPS32 mpn_rshift -- Right shift.
dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips32/sub_n.asm b/gmp/mpn/mips32/sub_n.asm
index a962ce1b79..ed41271676 100644
--- a/gmp/mpn/mips32/sub_n.asm
+++ b/gmp/mpn/mips32/sub_n.asm
@@ -4,30 +4,19 @@ dnl store difference in a third limb vector.
dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips32/submul_1.asm b/gmp/mpn/mips32/submul_1.asm
index 335722b4e5..4e43654e0a 100644
--- a/gmp/mpn/mips32/submul_1.asm
+++ b/gmp/mpn/mips32/submul_1.asm
@@ -4,30 +4,19 @@ dnl subtract the product from a second limb vector.
dnl Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips32/umul.asm b/gmp/mpn/mips32/umul.asm
index 1ced0eb883..04ecbe5095 100644
--- a/gmp/mpn/mips32/umul.asm
+++ b/gmp/mpn/mips32/umul.asm
@@ -3,30 +3,19 @@ dnl MIPS32 umul_ppmm -- longlong.h support.
dnl Copyright 1999, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips64/README b/gmp/mpn/mips64/README
index 7ddd0e572c..65a1af1668 100644
--- a/gmp/mpn/mips64/README
+++ b/gmp/mpn/mips64/README
@@ -3,28 +3,17 @@ Copyright 1996 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -41,7 +30,7 @@ RELEVANT OPTIMIZATION ISSUES
On the R4600, branches takes a single cycle
- On the R8000, branches often take no noticeable cycles, as they are
+ On the R8000, branches often take no noticable cycles, as they are
executed in a separate function unit..
2. The R4000 and R4400 have a load latency of 4 cycles.
diff --git a/gmp/mpn/mips64/add_n.asm b/gmp/mpn/mips64/add_n.asm
index 6856407efd..1a3978c3f9 100644
--- a/gmp/mpn/mips64/add_n.asm
+++ b/gmp/mpn/mips64/add_n.asm
@@ -1,33 +1,22 @@
dnl MIPS64 mpn_add_n -- Add two limb vectors of the same length > 0 and store
dnl sum in a third limb vector.
-dnl Copyright 1995, 2000-2002, 2011 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -38,17 +27,6 @@ C s2_ptr $6
C size $7
ASM_START()
-PROLOGUE(mpn_add_nc)
- ld $10,0($5)
- ld $11,0($6)
-
- daddiu $7,$7,-1
- and $9,$7,4-1 C number of limbs in first loop
- beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop
- move $2,$8
- b .Loop0
- dsubu $7,$7,$9
-EPILOGUE()
PROLOGUE(mpn_add_n)
ld $10,0($5)
ld $11,0($6)
@@ -131,4 +109,4 @@ PROLOGUE(mpn_add_n)
sd $11,0($4)
j $31
or $2,$2,$8
-EPILOGUE()
+EPILOGUE(mpn_add_n)
diff --git a/gmp/mpn/mips64/addmul_1.asm b/gmp/mpn/mips64/addmul_1.asm
index 8ff0976e25..a116298a76 100644
--- a/gmp/mpn/mips64/addmul_1.asm
+++ b/gmp/mpn/mips64/addmul_1.asm
@@ -1,33 +1,23 @@
dnl MIPS64 mpn_addmul_1 -- Multiply a limb vector with a single limb and add
dnl the product to a second limb vector.
-dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1995, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips64/gmp-mparam.h b/gmp/mpn/mips64/gmp-mparam.h
index b7fcf24a41..d189e895c5 100644
--- a/gmp/mpn/mips64/gmp-mparam.h
+++ b/gmp/mpn/mips64/gmp-mparam.h
@@ -1,45 +1,35 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License along
+with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
/* Generated by tuneup.c, 2004-02-10, gcc 3.2 & MIPSpro C 7.2.1 (R1x000) */
-#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 89
+#define MUL_KARATSUBA_THRESHOLD 16
+#define MUL_TOOM3_THRESHOLD 89
#define SQR_BASECASE_THRESHOLD 6
-#define SQR_TOOM2_THRESHOLD 32
+#define SQR_KARATSUBA_THRESHOLD 32
#define SQR_TOOM3_THRESHOLD 98
#define DIV_SB_PREINV_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/mips64/lshift.asm b/gmp/mpn/mips64/lshift.asm
index 3440eaf80b..16da93c5ab 100644
--- a/gmp/mpn/mips64/lshift.asm
+++ b/gmp/mpn/mips64/lshift.asm
@@ -1,32 +1,21 @@
dnl MIPS64 mpn_lshift -- Left shift.
-dnl Copyright 1995, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips64/mul_1.asm b/gmp/mpn/mips64/mul_1.asm
index 77acf0ac25..d16e08d594 100644
--- a/gmp/mpn/mips64/mul_1.asm
+++ b/gmp/mpn/mips64/mul_1.asm
@@ -1,33 +1,23 @@
dnl MIPS64 mpn_mul_1 -- Multiply a limb vector with a single limb and store
dnl the product in a second limb vector.
-dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1995, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips64/rshift.asm b/gmp/mpn/mips64/rshift.asm
index 9253cb51d8..5294875621 100644
--- a/gmp/mpn/mips64/rshift.asm
+++ b/gmp/mpn/mips64/rshift.asm
@@ -1,32 +1,21 @@
dnl MIPS64 mpn_rshift -- Right shift.
-dnl Copyright 1995, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips64/sqr_diagonal.asm b/gmp/mpn/mips64/sqr_diagonal.asm
index dcb87dc21f..511a7552c9 100644
--- a/gmp/mpn/mips64/sqr_diagonal.asm
+++ b/gmp/mpn/mips64/sqr_diagonal.asm
@@ -3,30 +3,19 @@ dnl MIPS64 mpn_sqr_diagonal.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/mips64/sub_n.asm b/gmp/mpn/mips64/sub_n.asm
index 6a698976eb..b28c1ced9c 100644
--- a/gmp/mpn/mips64/sub_n.asm
+++ b/gmp/mpn/mips64/sub_n.asm
@@ -1,33 +1,22 @@
dnl MIPS64 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl store difference in a third limb vector.
-dnl Copyright 1995, 2000-2002, 2011 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -38,17 +27,6 @@ C s2_ptr $6
C size $7
ASM_START()
-PROLOGUE(mpn_sub_nc)
- ld $10,0($5)
- ld $11,0($6)
-
- daddiu $7,$7,-1
- and $9,$7,4-1 C number of limbs in first loop
- beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop
- move $2,$8
- b .Loop0
- dsubu $7,$7,$9
-EPILOGUE()
PROLOGUE(mpn_sub_n)
ld $10,0($5)
ld $11,0($6)
@@ -131,4 +109,4 @@ PROLOGUE(mpn_sub_n)
sd $11,0($4)
j $31
or $2,$2,$8
-EPILOGUE()
+EPILOGUE(mpn_sub_n)
diff --git a/gmp/mpn/mips64/submul_1.asm b/gmp/mpn/mips64/submul_1.asm
index 089589cd73..11e17370c0 100644
--- a/gmp/mpn/mips64/submul_1.asm
+++ b/gmp/mpn/mips64/submul_1.asm
@@ -1,33 +1,23 @@
dnl MIPS64 mpn_submul_1 -- Multiply a limb vector with a single limb and
dnl subtract the product from a second limb vector.
-dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1995, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/mips64/umul.asm b/gmp/mpn/mips64/umul.asm
index b9aac57591..1792d97fdb 100644
--- a/gmp/mpn/mips64/umul.asm
+++ b/gmp/mpn/mips64/umul.asm
@@ -3,30 +3,19 @@ dnl MIPS64 umul_ppmm -- longlong.h support.
dnl Copyright 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/ns32k/add_n.s b/gmp/mpn/ns32k/add_n.s
new file mode 100644
index 0000000000..962cc1657b
--- /dev/null
+++ b/gmp/mpn/ns32k/add_n.s
@@ -0,0 +1,44 @@
+# ns32000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+ .align 1
+.globl ___gmpn_add_n # exported symbol for the limb-vector add routine
+___gmpn_add_n:
+ save [r3,r4,r5] # preserve the registers used as index/pointers below
+ negd 28(sp),r3 # r3 = -(word at 28(sp)); presumably the limb count -- negative index that counts up to 0
+ movd r3,r0
+ lshd 2,r0 # r0 = r3 << 2, i.e. the negative byte length (4 bytes per limb)
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2 (subtracting the negative length advances the pointer)
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0 (r0 - r0 cannot borrow)
+
+Loop: movd r5[r3:d],r0 # r0 = S1 limb at index r3 (scaled by 4)
+ addcd r4[r3:d],r0 # r0 += S2 limb + incoming carry
+ movd r0,r2[r3:d] # store the sum limb into RES
+ acbd 1,r3,Loop # r3 += 1, loop until it reaches 0; presumably leaves the carry flag intact
+
+ scsd r0 # r0 = cy. (1 if the last addcd carried, else 0)
+ restore [r5,r4,r3] # undo the save above
+ ret 0
diff --git a/gmp/mpn/ns32k/addmul_1.s b/gmp/mpn/ns32k/addmul_1.s
new file mode 100644
index 0000000000..1dd8791be3
--- /dev/null
+++ b/gmp/mpn/ns32k/addmul_1.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+ .align 1
+.globl ___gmpn_addmul_1 # exported symbol for the multiply-and-accumulate routine
+___gmpn_addmul_1:
+ save [r3,r4,r5,r6,r7] # NOTE(review): 5 regs pushed (20 bytes) + return address would put the first argument at 24(sp); the 16..28(sp) offsets below look like they assume fewer saved registers -- confirm
+ negd 24(sp),r4 # r4 = -(word at 24(sp)); presumably the limb count -- negative index that counts up to 0
+ movd r4,r0
+ lshd 2,r0 # r0 = r4 << 2, i.e. the negative byte length (4 bytes per limb)
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2 # r2 = S1 limb at index r4 (scaled by 4)
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb (plus the carry flag left by the previous iteration's addd)
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0 # fold the carry out of the low-word add into cy_limb
+ addd r2,r6[r4:d] # RES limb += r2, in memory; its carry is picked up by the next addcd
+ acbd 1,r4,Loop # r4 += 1, loop until it reaches 0; presumably leaves the carry flag intact
+
+ addcd 0,r0 # add the final pending carry; r0 is then the carry-out limb
+ restore [r7,r6,r5,r4,r3] # undo the save above
+ ret 0
diff --git a/gmp/mpn/ns32k/mul_1.s b/gmp/mpn/ns32k/mul_1.s
new file mode 100644
index 0000000000..abc911e0c9
--- /dev/null
+++ b/gmp/mpn/ns32k/mul_1.s
@@ -0,0 +1,45 @@
+# ns32000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+ .align 1
+.globl ___gmpn_mul_1 # exported symbol for the vector-by-limb multiply routine
+___gmpn_mul_1:
+ save [r3,r4,r5,r6,r7] # NOTE(review): 5 regs pushed (20 bytes) + return address would put the first argument at 24(sp); the 16..28(sp) offsets below look like they assume fewer saved registers -- confirm
+ negd 24(sp),r4 # r4 = -(word at 24(sp)); presumably the limb count -- negative index that counts up to 0
+ movd r4,r0
+ lshd 2,r0 # r0 = r4 << 2, i.e. the negative byte length (4 bytes per limb)
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2 # r2 = S1 limb at index r4 (scaled by 4)
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb (plus the carry flag left by the previous iteration's addcd)
+ movd r3,r0 # r0 = new cy_limb
+ movd r2,r6[r4:d] # store the product limb into RES
+ acbd 1,r4,Loop # r4 += 1, loop until it reaches 0; presumably leaves the carry flag intact
+
+ addcd 0,r0 # add the final pending carry; r0 is then the carry-out limb
+ restore [r7,r6,r5,r4,r3] # undo the save above
+ ret 0
diff --git a/gmp/mpn/ns32k/sub_n.s b/gmp/mpn/ns32k/sub_n.s
new file mode 100644
index 0000000000..5252ddf5c6
--- /dev/null
+++ b/gmp/mpn/ns32k/sub_n.s
@@ -0,0 +1,44 @@
+# ns32000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+ .align 1
+.globl ___gmpn_sub_n # stack arguments, in order: RES pointer, S1 pointer, S2 pointer, size (limbs, > 0)
+___gmpn_sub_n: # RES[i] = S1[i] - S2[i] - borrow, limb by limb; r0 receives the final borrow
+ save [r3,r4,r5] # after this save the code reads RES at 16(sp), S1 at 20(sp), S2 at 24(sp), size at 28(sp)
+ negd 28(sp),r3 # r3 = -size: limb index that counts up towards 0
+ movd r3,r0
+ lshd 2,r0 # r0 = -size * 4, the negated length in bytes
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0
+# Each pass handles the limb at index r3, addressed relative to the ends of the three vectors.
+Loop: movd r5[r3:d],r0 # r0 = S1 limb
+ subcd r4[r3:d],r0 # r0 = S1 limb - S2 limb - cy
+ movd r0,r2[r3:d] # store the difference limb into RES
+ acbd 1,r3,Loop # r3 += 1, repeat until the index reaches 0; the borrow chain needs cy to survive movd/acbd
+
+ scsd r0 # r0 = cy.
+ restore [r5,r4,r3]
+ ret 0
diff --git a/gmp/mpn/ns32k/submul_1.s b/gmp/mpn/ns32k/submul_1.s
new file mode 100644
index 0000000000..7a0ba9a73c
--- /dev/null
+++ b/gmp/mpn/ns32k/submul_1.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+ .align 1
+.globl ___gmpn_submul_1 # stack arguments, in order: RES pointer, S1 pointer, size (limbs, > 0), s2_limb
+___gmpn_submul_1: # RES[i] -= low word of S1[i] * s2_limb + carry limb; the last carry/borrow limb is left in r0
+ save [r3,r4,r5,r6,r7] # five registers = 20 bytes pushed; with a 4-byte return address above them the first argument is at 24(sp)
+ negd 32(sp),r4 # r4 = -size: limb index that counts up towards 0 (offsets 16..28 would only fit a 3-register save)
+ movd r4,r0
+ lshd 2,r0 # r0 = -size * 4, the negated length in bytes
+ movd 28(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 24(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 36(sp),r7 # r7 = s2_limb
+# Loop state: r0 = carry limb from the previous product, cy = borrow of the previous subtraction from RES.
+Loop: movd r5[r4:d],r2 # r2 = S1 limb at index r4 (scaled by 4, relative to the end of S1)
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0 # fold the carry of the low-word addition into the new carry limb
+ subd r2,r6[r4:d] # RES limb -= r2; the borrow stays in cy for the next addcd
+ acbd 1,r4,Loop # r4 += 1, repeat until the index reaches 0; cy must survive this instruction
+
+ addcd 0,r0 # add the borrow of the last subtraction to the returned limb
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/gmp/mpn/pa32/README b/gmp/mpn/pa32/README
index 4323390c9b..72158d30ea 100644
--- a/gmp/mpn/pa32/README
+++ b/gmp/mpn/pa32/README
@@ -3,28 +3,17 @@ Copyright 1996, 1999, 2001, 2002, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/pa32/add_n.asm b/gmp/mpn/pa32/add_n.asm
index 46f39377ea..1bb27ae883 100644
--- a/gmp/mpn/pa32/add_n.asm
+++ b/gmp/mpn/pa32/add_n.asm
@@ -1,33 +1,22 @@
dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store
dnl sum in a third limb vector.
-dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/gmp-mparam.h b/gmp/mpn/pa32/gmp-mparam.h
index 377efcb156..005539c0d7 100644
--- a/gmp/mpn/pa32/gmp-mparam.h
+++ b/gmp/mpn/pa32/gmp-mparam.h
@@ -1,61 +1,53 @@
/* HP-PA 1.0 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2012 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* These values are for the PA7100 using GCC. */
/* Generated by tuneup.c, 2000-10-27. */
-#ifndef MUL_TOOM22_THRESHOLD
-#define MUL_TOOM22_THRESHOLD 30
+#ifndef MUL_KARATSUBA_THRESHOLD
+#define MUL_KARATSUBA_THRESHOLD 30
#endif
-#ifndef MUL_TOOM33_THRESHOLD
-#define MUL_TOOM33_THRESHOLD 141
+#ifndef MUL_TOOM3_THRESHOLD
+#define MUL_TOOM3_THRESHOLD 141
#endif
-#ifndef SQR_TOOM2_THRESHOLD
-#define SQR_TOOM2_THRESHOLD 59
+#ifndef SQR_KARATSUBA_THRESHOLD
+#define SQR_KARATSUBA_THRESHOLD 59
#endif
#ifndef SQR_TOOM3_THRESHOLD
#define SQR_TOOM3_THRESHOLD 177
#endif
#ifndef DIV_DC_THRESHOLD
-#define DIV_DC_THRESHOLD 108
+#define DIV_DC_THRESHOLD 108
#endif
#ifndef POWM_THRESHOLD
#define POWM_THRESHOLD 18
#endif
-#ifndef GCDEXT_THRESHOLD
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
#define GCDEXT_THRESHOLD 33
#endif
diff --git a/gmp/mpn/pa32/hppa1_1/addmul_1.asm b/gmp/mpn/pa32/hppa1_1/addmul_1.asm
index ec2f2198e8..c50e4e10f7 100644
--- a/gmp/mpn/pa32/hppa1_1/addmul_1.asm
+++ b/gmp/mpn/pa32/hppa1_1/addmul_1.asm
@@ -1,33 +1,23 @@
dnl HP-PA 1.1 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl result to a second limb vector.
-dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/gmp-mparam.h b/gmp/mpn/pa32/hppa1_1/gmp-mparam.h
index 1261b24c83..5ced745486 100644
--- a/gmp/mpn/pa32/hppa1_1/gmp-mparam.h
+++ b/gmp/mpn/pa32/hppa1_1/gmp-mparam.h
@@ -1,43 +1,33 @@
/* HP-PA 1.1 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* Generated by tuneup.c, 2004-02-07, gcc 2.8 (pa7100/100MHz) */
-#define MUL_TOOM22_THRESHOLD 30
-#define MUL_TOOM33_THRESHOLD 89
+#define MUL_KARATSUBA_THRESHOLD 30
+#define MUL_TOOM3_THRESHOLD 89
#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 55
+#define SQR_KARATSUBA_THRESHOLD 55
#define SQR_TOOM3_THRESHOLD 101
#define DIV_SB_PREINV_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/pa32/hppa1_1/mul_1.asm b/gmp/mpn/pa32/hppa1_1/mul_1.asm
index 6e60c2f61f..9e17c2d023 100644
--- a/gmp/mpn/pa32/hppa1_1/mul_1.asm
+++ b/gmp/mpn/pa32/hppa1_1/mul_1.asm
@@ -1,33 +1,23 @@
dnl HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the
dnl result in a second limb vector.
-dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm b/gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm
index b96d403826..326a133984 100644
--- a/gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm
+++ b/gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm
@@ -2,33 +2,23 @@ dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store
dnl sum in a third limb vector. Optimized for the PA7100, where it runs at
dnl 4.25 cycles/limb.
-dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002, 2003 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm b/gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
index fb16100d83..57f4d76745 100644
--- a/gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
+++ b/gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
@@ -1,33 +1,22 @@
dnl HP-PA 7100/7200 mpn_addmul_1 -- Multiply a limb vector with a limb and
dnl add the result to a second limb vector.
-dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm b/gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm
index d65db2a76b..f6b4068cfa 100644
--- a/gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm
+++ b/gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm
@@ -1,33 +1,23 @@
dnl HP-PA mpn_lshift -- Shift a number left.
dnl Optimized for the PA7100, where it runs at 3.25 cycles/limb.
-dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002, 2003 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm b/gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm
index f7896fc949..ed7313b5fc 100644
--- a/gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm
+++ b/gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm
@@ -1,33 +1,23 @@
dnl HP-PA mpn_rshift -- Shift a number right.
dnl Optimized for the PA7100, where it runs at 3.25 cycles/limb.
-dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002, 2003 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm b/gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm
index df3f6e8b81..38ea0e197e 100644
--- a/gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm
+++ b/gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm
@@ -2,33 +2,23 @@ dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl store difference in a third limb vector. Optimized for the PA7100, where
dnl it runs at 4.25 cycles/limb.
-dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002, 2003 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm b/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm
index 5ea08cbee5..aee9d9033e 100644
--- a/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm
+++ b/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm
@@ -1,33 +1,22 @@
dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and
dnl subtract the result from a second limb vector.
-dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc.
+dnl Copyright 1995, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm b/gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm
index 1c7a18e37d..4eba989276 100644
--- a/gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm
+++ b/gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm
@@ -3,30 +3,19 @@ dnl HP-PA 1.1 32-bit mpn_sqr_diagonal.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/submul_1.asm b/gmp/mpn/pa32/hppa1_1/submul_1.asm
index a9b11d24a8..c6bc38394b 100644
--- a/gmp/mpn/pa32/hppa1_1/submul_1.asm
+++ b/gmp/mpn/pa32/hppa1_1/submul_1.asm
@@ -1,33 +1,23 @@
dnl HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
dnl the result from a second limb vector.
-dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/udiv.asm b/gmp/mpn/pa32/hppa1_1/udiv.asm
index 626ecd202b..e6a9927edf 100644
--- a/gmp/mpn/pa32/hppa1_1/udiv.asm
+++ b/gmp/mpn/pa32/hppa1_1/udiv.asm
@@ -4,30 +4,19 @@ dnl This version runs fast on PA 7000 and later.
dnl Copyright 1993, 1994, 2000, 2001, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa1_1/umul.asm b/gmp/mpn/pa32/hppa1_1/umul.asm
index 18b923cd5a..7f1cb93949 100644
--- a/gmp/mpn/pa32/hppa1_1/umul.asm
+++ b/gmp/mpn/pa32/hppa1_1/umul.asm
@@ -1,30 +1,19 @@
dnl Copyright 1999, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa2_0/add_n.asm b/gmp/mpn/pa32/hppa2_0/add_n.asm
index 8d881b8b08..685c4c91ae 100644
--- a/gmp/mpn/pa32/hppa2_0/add_n.asm
+++ b/gmp/mpn/pa32/hppa2_0/add_n.asm
@@ -1,33 +1,22 @@
dnl HP-PA 2.0 32-bit mpn_add_n -- Add two limb vectors of the same length > 0
dnl and store sum in a third limb vector.
-dnl Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa2_0/gmp-mparam.h b/gmp/mpn/pa32/hppa2_0/gmp-mparam.h
index 6016274714..c356b4acce 100644
--- a/gmp/mpn/pa32/hppa2_0/gmp-mparam.h
+++ b/gmp/mpn/pa32/hppa2_0/gmp-mparam.h
@@ -1,167 +1,73 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2009, 2010 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2009 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 552 MHz PA8600 (gcc61.fsffrance.org) */
-
-#define DIVREM_1_NORM_THRESHOLD 3
-#define DIVREM_1_UNNORM_THRESHOLD 3
-#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 4
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 28
-#define USE_PREINV_DIVREM_1 1
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 36
-
-#define MUL_TOOM22_THRESHOLD 18
-#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 166
-#define MUL_TOOM6H_THRESHOLD 202
-#define MUL_TOOM8H_THRESHOLD 333
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
-
-#define SQR_BASECASE_THRESHOLD 7
-#define SQR_TOOM2_THRESHOLD 55
-#define SQR_TOOM3_THRESHOLD 93
-#define SQR_TOOM4_THRESHOLD 250
-#define SQR_TOOM6_THRESHOLD 306
-#define SQR_TOOM8_THRESHOLD 527
-
-#define MULMOD_BNM1_THRESHOLD 13
-#define SQRMOD_BNM1_THRESHOLD 15
-
-#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 244, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \
- { 7, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 11, 6}, { 24, 7}, { 13, 8}, \
- { 7, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \
- { 15, 7}, { 33, 8}, { 23, 9}, { 15, 8}, \
- { 39, 9}, { 23,10}, { 15, 9}, { 31, 8}, \
- { 67, 9}, { 39, 8}, { 79, 9}, { 47,10}, \
- { 31, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
- { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 135, 8}, { 271, 9}, { 143,10}, \
- { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \
- { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \
- { 207,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159, 9}, { 319,10}, { 175, 9}, \
- { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
- { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 351, 9}, { 703, 8}, { 1407,11}, \
- { 191,10}, { 415, 9}, { 831,11}, { 223, 9}, \
- { 895,10}, { 479,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
- { 1215,11}, { 351,10}, { 703, 9}, { 1407,12}, \
- { 191,11}, { 415,10}, { 831,11}, { 479,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 107
-#define MUL_FFT_THRESHOLD 2112
-
-#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 240, 5}, { 8, 4}, { 17, 5}, { 19, 6}, \
- { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 25, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 23, 9}, { 15, 8}, \
- { 39, 9}, { 23,10}, { 15, 9}, { 31, 8}, \
- { 63, 9}, { 47,10}, { 31, 9}, { 63, 8}, \
- { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
- { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 7}, { 511, 9}, { 135, 8}, { 271, 9}, \
- { 143,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
- { 175, 8}, { 351, 7}, { 703,10}, { 95, 9}, \
- { 191, 8}, { 383, 9}, { 207,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
- { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \
- { 415,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 351, 9}, { 703, 8}, { 1407,11}, \
- { 191,10}, { 415, 9}, { 831,11}, { 223, 8}, \
- { 1791,10}, { 479, 9}, { 959,12}, { 127,11}, \
- { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \
- { 1407,12}, { 191,11}, { 415,10}, { 831,11}, \
- { 479,10}, { 959,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 109
-#define SQR_FFT_THRESHOLD 1600
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 116
-#define MULLO_MUL_N_THRESHOLD 3574
-
-#define DC_DIV_QR_THRESHOLD 100
-#define DC_DIVAPPR_Q_THRESHOLD 348
-#define DC_BDIV_QR_THRESHOLD 109
-#define DC_BDIV_Q_THRESHOLD 254
-
-#define INV_MULMOD_BNM1_THRESHOLD 34
-#define INV_NEWTON_THRESHOLD 276
-#define INV_APPR_THRESHOLD 276
-
-#define BINV_NEWTON_THRESHOLD 278
-#define REDC_1_TO_REDC_N_THRESHOLD 78
-
-#define MU_DIV_QR_THRESHOLD 979
-#define MU_DIVAPPR_Q_THRESHOLD 263
-#define MUPI_DIV_QR_THRESHOLD 102
-#define MU_BDIV_QR_THRESHOLD 807
-#define MU_BDIV_Q_THRESHOLD 1187
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 100
-#define GCD_DC_THRESHOLD 379
-#define GCDEXT_DC_THRESHOLD 249
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 7
-#define GET_STR_PRECOMPUTE_THRESHOLD 16
-#define SET_STR_DC_THRESHOLD 270
-#define SET_STR_PRECOMPUTE_THRESHOLD 782
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2009-03-05, gcc 4.3 */
+
+#define MUL_KARATSUBA_THRESHOLD 15
+#define MUL_TOOM3_THRESHOLD 98
+#define MUL_TOOM44_THRESHOLD 158
+
+#define SQR_BASECASE_THRESHOLD 6
+#define SQR_KARATSUBA_THRESHOLD 48
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 232
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 92
+#define MULLOW_MUL_N_THRESHOLD 363
+
+#define DIV_SB_PREINV_THRESHOLD 4
+#define DIV_DC_THRESHOLD 92
+#define POWM_THRESHOLD 142
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 100
+#define GCD_DC_THRESHOLD 365
+#define GCDEXT_DC_THRESHOLD 339
+#define JACOBI_BASE_METHOD 2
+
+#define DIVREM_1_NORM_THRESHOLD 3
+#define DIVREM_1_UNNORM_THRESHOLD 5
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1_1_THRESHOLD 6
+#define MOD_1_2_THRESHOLD 9
+#define MOD_1_4_THRESHOLD 24
+#define USE_PREINV_DIVREM_1 1
+#define USE_PREINV_MOD_1 1
+#define DIVREM_2_THRESHOLD 0 /* always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define GET_STR_DC_THRESHOLD 8
+#define GET_STR_PRECOMPUTE_THRESHOLD 13
+#define SET_STR_DC_THRESHOLD 224
+#define SET_STR_PRECOMPUTE_THRESHOLD 702
+
+#define MUL_FFT_TABLE { 272, 672, 896, 2560, 6144, 24576, 98304, 393216, 0 }
+#define MUL_FFT_MODF_THRESHOLD 232
+#define MUL_FFT_THRESHOLD 1792
+
+#define SQR_FFT_TABLE { 304, 672, 1152, 2560, 10240, 24576, 98304, 393216, 0 }
+#define SQR_FFT_MODF_THRESHOLD 232
+#define SQR_FFT_THRESHOLD 1792
diff --git a/gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm b/gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm
index c55112fac5..3493c87322 100644
--- a/gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm
+++ b/gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm
@@ -3,30 +3,19 @@ dnl HP-PA 32-bit mpn_sqr_diagonal optimized for the PA8x00.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/hppa2_0/sub_n.asm b/gmp/mpn/pa32/hppa2_0/sub_n.asm
index 47b3163fe3..b0aefb4abb 100644
--- a/gmp/mpn/pa32/hppa2_0/sub_n.asm
+++ b/gmp/mpn/pa32/hppa2_0/sub_n.asm
@@ -1,33 +1,22 @@
dnl HP-PA 2.0 32-bit mpn_sub_n -- Subtract two limb vectors of the same
dnl length > 0 and store difference in a third limb vector.
-dnl Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/lshift.asm b/gmp/mpn/pa32/lshift.asm
index 5ea497c1f1..2128fbeed4 100644
--- a/gmp/mpn/pa32/lshift.asm
+++ b/gmp/mpn/pa32/lshift.asm
@@ -1,32 +1,21 @@
dnl HP-PA mpn_lshift -- Shift a number left.
-dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/pa-defs.m4 b/gmp/mpn/pa32/pa-defs.m4
index b26e715fc5..affaef897a 100644
--- a/gmp/mpn/pa32/pa-defs.m4
+++ b/gmp/mpn/pa32/pa-defs.m4
@@ -3,37 +3,26 @@ divert(-1)
dnl m4 macros for HPPA assembler.
dnl Copyright 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl hppa assembler comments are introduced with ";".
dnl
-dnl For cooperation with cpp, apparently lines "# 123" set the line number,
+dnl For cooperation with cpp, aparently lines "# 123" set the line number,
dnl and other lines starting with a "#" are ignored.
changecom(;)
diff --git a/gmp/mpn/pa32/rshift.asm b/gmp/mpn/pa32/rshift.asm
index c5eac830c9..238b0be7ed 100644
--- a/gmp/mpn/pa32/rshift.asm
+++ b/gmp/mpn/pa32/rshift.asm
@@ -1,32 +1,21 @@
dnl HP-PA mpn_rshift -- Shift a number right.
-dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/sub_n.asm b/gmp/mpn/pa32/sub_n.asm
index 9c71655b98..d07ebb5bb6 100644
--- a/gmp/mpn/pa32/sub_n.asm
+++ b/gmp/mpn/pa32/sub_n.asm
@@ -1,33 +1,22 @@
dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl store difference in a third limb vector.
-dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa32/udiv.asm b/gmp/mpn/pa32/udiv.asm
index addbf41ef5..86886e484c 100644
--- a/gmp/mpn/pa32/udiv.asm
+++ b/gmp/mpn/pa32/udiv.asm
@@ -1,33 +1,22 @@
dnl HP-PA __udiv_qrnnd division support, used from longlong.h.
dnl This version runs fast on pre-PA7000 CPUs.
-dnl Copyright 1993, 1994, 2000-2002 Free Software Foundation, Inc.
+dnl Copyright 1993, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa64/README b/gmp/mpn/pa64/README
index a51ce028a4..6234a407f2 100644
--- a/gmp/mpn/pa64/README
+++ b/gmp/mpn/pa64/README
@@ -3,28 +3,17 @@ Copyright 1999, 2001, 2002, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/pa64/add_n.asm b/gmp/mpn/pa64/add_n.asm
new file mode 100644
index 0000000000..1c2055590c
--- /dev/null
+++ b/gmp/mpn/pa64/add_n.asm
@@ -0,0 +1,93 @@
+dnl HP-PA 2.0 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+dnl This runs at 2 cycles/limb on PA8000 and 1.6875 cycles/limb on PA8500. It
+dnl should be possible to reach the cache bandwidth 1.5 cycles/limb at least
+dnl with PA8500. The problem now is stalling of the first ADD,DC after LDO,
+dnl where the processor gets confused about where carry comes from.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`vp',`%r24')
+define(`n',`%r23')
+
+ifdef(`HAVE_ABI_2_0w',
+` .level 2.0w
+',` .level 2.0
+')
+PROLOGUE(mpn_add_n)
+ sub %r0, n, %r22 C r22 = -n
+ depw,z %r22, 30, 3, %r28 C r28 = 2 * (-n & 7)
+ depw,z %r22, 28, 3, %r22 C r22 = 8 * (-n & 7)
+ sub up, %r22, up C offset up
+ sub vp, %r22, vp C offset vp
+ sub rp, %r22, rp C offset rp
+ blr %r28, %r0 C branch into loop
+ add %r0, %r0, %r0 C reset carry
+
+LDEF(loop) C 8 limbs per pass; the first pass is entered part-way by the blr above
+ ldd 0(up), %r20 C load next limb of up
+ ldd 0(vp), %r31 C load next limb of vp
+ add,dc %r20, %r31, %r20 C limb sum plus carry in; produces carry out
+ std %r20, 0(rp) C store result limb
+LDEF(7) ldd 8(up), %r21
+ ldd 8(vp), %r19
+ add,dc %r21, %r19, %r21
+ std %r21, 8(rp)
+LDEF(6) ldd 16(up), %r20
+ ldd 16(vp), %r31
+ add,dc %r20, %r31, %r20
+ std %r20, 16(rp)
+LDEF(5) ldd 24(up), %r21
+ ldd 24(vp), %r19
+ add,dc %r21, %r19, %r21
+ std %r21, 24(rp)
+LDEF(4) ldd 32(up), %r20
+ ldd 32(vp), %r31
+ add,dc %r20, %r31, %r20
+ std %r20, 32(rp)
+LDEF(3) ldd 40(up), %r21
+ ldd 40(vp), %r19
+ add,dc %r21, %r19, %r21
+ std %r21, 40(rp)
+LDEF(2) ldd 48(up), %r20
+ ldd 48(vp), %r31
+ add,dc %r20, %r31, %r20
+ std %r20, 48(rp)
+LDEF(1) ldd 56(up), %r21
+ ldd 56(vp), %r19
+ add,dc %r21, %r19, %r21
+ ldo 64(up), up C advance up by 64 bytes (8 limbs)
+ std %r21, 56(rp)
+ ldo 64(vp), vp C advance vp by 64 bytes (8 limbs)
+ addib,> -8, n, L(loop) C n -= 8; loop again while n > 0
+ ldo 64(rp), rp C advance rp by 64 bytes (branch delay slot)
+
+ add,dc %r0, %r0, %r29 C r29 = 0 + 0 + carry, i.e. the carry out of the top limb
+ bve (%r2) C return via %r2; the insn selected below sets %r28
+ifdef(`HAVE_ABI_2_0w',
+` copy %r29, %r28
+',` ldi 0, %r28
+')
+EPILOGUE(mpn_add_n)
diff --git a/gmp/mpn/pa64/addmul_1.asm b/gmp/mpn/pa64/addmul_1.asm
index 2cb9af9f14..4e76546050 100644
--- a/gmp/mpn/pa64/addmul_1.asm
+++ b/gmp/mpn/pa64/addmul_1.asm
@@ -1,33 +1,22 @@
dnl HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
dnl add the result to a second limb vector.
-dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa64/aors_n.asm b/gmp/mpn/pa64/aors_n.asm
deleted file mode 100644
index ab4536fefb..0000000000
--- a/gmp/mpn/pa64/aors_n.asm
+++ /dev/null
@@ -1,130 +0,0 @@
-dnl HP-PA 2.0 mpn_add_n, mpn_sub_n
-
-dnl Copyright 1997, 2000, 2002, 2003, 2009, 2010 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-dnl This runs at 2 cycles/limb on PA8000 and 1.6875 cycles/limb on PA8500. It
-dnl should be possible to reach the cache bandwidth 1.5 cycles/limb at least
-dnl with PA8500. The problem now is stalling of the first ADD,DC after LDO,
-dnl where the processor gets confused about where carry comes from.
-
-include(`../config.m4')
-
-dnl INPUT PARAMETERS
-define(`rp',`%r26')
-define(`up',`%r25')
-define(`vp',`%r24')
-define(`n',`%r23')
-
-ifdef(`OPERATION_add_n', `
- define(ADCSBC, `add,dc')
- define(INITCY, `addi -1,%r22,%r0')
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(ADCSBC, `sub,db')
- define(INITCY, `subi 0,%r22,%r0')
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-ifdef(`HAVE_ABI_2_0w',
-` .level 2.0w
-',` .level 2.0
-')
-PROLOGUE(func_nc)
-ifdef(`HAVE_ABI_2_0w',
-` b L(com)
- nop
-',` b L(com)
- ldw -52(%r30), %r22
-')
-EPILOGUE()
-PROLOGUE(func)
- ldi 0, %r22
-LDEF(com)
- sub %r0, n, %r21
- depw,z %r21, 30, 3, %r28 C r28 = 2 * (-n & 7)
- depw,z %r21, 28, 3, %r21 C r21 = 8 * (-n & 7)
- sub up, %r21, up C offset up
- sub vp, %r21, vp C offset vp
- sub rp, %r21, rp C offset rp
- blr %r28, %r0 C branch into loop
- INITCY
-
-LDEF(loop)
- ldd 0(up), %r20
- ldd 0(vp), %r31
- ADCSBC %r20, %r31, %r20
- std %r20, 0(rp)
-LDEF(7) ldd 8(up), %r21
- ldd 8(vp), %r19
- ADCSBC %r21, %r19, %r21
- std %r21, 8(rp)
-LDEF(6) ldd 16(up), %r20
- ldd 16(vp), %r31
- ADCSBC %r20, %r31, %r20
- std %r20, 16(rp)
-LDEF(5) ldd 24(up), %r21
- ldd 24(vp), %r19
- ADCSBC %r21, %r19, %r21
- std %r21, 24(rp)
-LDEF(4) ldd 32(up), %r20
- ldd 32(vp), %r31
- ADCSBC %r20, %r31, %r20
- std %r20, 32(rp)
-LDEF(3) ldd 40(up), %r21
- ldd 40(vp), %r19
- ADCSBC %r21, %r19, %r21
- std %r21, 40(rp)
-LDEF(2) ldd 48(up), %r20
- ldd 48(vp), %r31
- ADCSBC %r20, %r31, %r20
- std %r20, 48(rp)
-LDEF(1) ldd 56(up), %r21
- ldd 56(vp), %r19
- ADCSBC %r21, %r19, %r21
- ldo 64(up), up
- std %r21, 56(rp)
- ldo 64(vp), vp
- addib,> -8, n, L(loop)
- ldo 64(rp), rp
-
- add,dc %r0, %r0, %r29
-ifdef(`OPERATION_sub_n',`
- subi 1, %r29, %r29
-')
- bve (%r2)
-ifdef(`HAVE_ABI_2_0w',
-` copy %r29, %r28
-',` ldi 0, %r28
-')
-EPILOGUE()
diff --git a/gmp/mpn/pa64/aorslsh1_n.asm b/gmp/mpn/pa64/aorslsh1_n.asm
index 2a55ddea30..b2cca7a356 100644
--- a/gmp/mpn/pa64/aorslsh1_n.asm
+++ b/gmp/mpn/pa64/aorslsh1_n.asm
@@ -3,30 +3,19 @@ dnl PA64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
dnl Copyright 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa64/gmp-mparam.h b/gmp/mpn/pa64/gmp-mparam.h
index c2719c3c89..aa92cb9191 100644
--- a/gmp/mpn/pa64/gmp-mparam.h
+++ b/gmp/mpn/pa64/gmp-mparam.h
@@ -1,247 +1,72 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2004, 2008-2010 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2008
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+The GNU MP Library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License along
+with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
/* 440MHz PA8200 */
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 10
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD 21
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 31
-#define MUL_TOOM33_THRESHOLD 114
-#define MUL_TOOM44_THRESHOLD 179
-#define MUL_TOOM6H_THRESHOLD 222
-#define MUL_TOOM8H_THRESHOLD 296
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 130
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 229
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 54
-
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 58
-#define SQR_TOOM3_THRESHOLD 153
-#define SQR_TOOM4_THRESHOLD 278
-#define SQR_TOOM6_THRESHOLD 0 /* always */
-#define SQR_TOOM8_THRESHOLD 0 /* always */
-
-#define MULMID_TOOM42_THRESHOLD 56
-
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 19
-
-#define POWM_SEC_TABLE 2,23,228,1084
-
-#define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 336, 5}, { 11, 4}, { 23, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 19, 7}, { 39, 8}, \
- { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,10}, \
- { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \
- { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \
- { 63, 9}, { 127,10}, { 71, 8}, { 287,10}, \
- { 79,11}, { 47,10}, { 95, 9}, { 191, 8}, \
- { 383, 7}, { 767,10}, { 103, 9}, { 207, 8}, \
- { 415, 7}, { 831,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 543, 7}, { 1087, 6}, \
- { 2175,10}, { 143, 9}, { 287, 8}, { 575,11}, \
- { 79, 9}, { 319, 8}, { 639, 7}, { 1279, 9}, \
- { 335, 8}, { 671,10}, { 175, 9}, { 351, 8}, \
- { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \
- { 767,10}, { 207, 9}, { 415, 8}, { 831, 7}, \
- { 1663,11}, { 111,10}, { 223, 9}, { 447, 8}, \
- { 895,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 543, 8}, { 1087, 7}, { 2175,10}, { 287, 9}, \
- { 575, 8}, { 1215, 7}, { 2431,10}, { 319, 9}, \
- { 639, 8}, { 1279,10}, { 335, 9}, { 671, 8}, \
- { 1343, 9}, { 703, 8}, { 1407,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207, 9}, { 831, 8}, \
- { 1663,11}, { 223,10}, { 447, 9}, { 959,13}, \
- { 63,12}, { 127,11}, { 255, 8}, { 2047,11}, \
- { 271,10}, { 543, 9}, { 1087, 8}, { 2175,11}, \
- { 287,10}, { 575, 9}, { 1215, 8}, { 2431,11}, \
- { 319,10}, { 671, 9}, { 1343, 8}, { 2687,11}, \
- { 351,10}, { 703, 9}, { 1471, 8}, { 2943,12}, \
- { 191,11}, { 383, 8}, { 3071,11}, { 415,10}, \
- { 831, 9}, { 1663,11}, { 479,10}, { 959, 9}, \
- { 1919, 8}, { 3839,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087, 9}, { 2175,12}, { 287,11}, \
- { 607,10}, { 1215, 9}, { 2431, 8}, { 4863,12}, \
- { 319,11}, { 671,10}, { 1343,13}, { 191, 9}, \
- { 3071,12}, { 415,11}, { 831,10}, { 1663, 8}, \
- { 6655, 9}, { 3455,12}, { 447, 9}, { 3583,13}, \
- { 255,12}, { 511,11}, { 1023,10}, { 2175,13}, \
- { 319,11}, { 1279,12}, { 671,10}, { 2815,12}, \
- { 735,10}, { 2943, 9}, { 5887,13}, { 383,12}, \
- { 767,11}, { 1535,10}, { 3071,13}, { 447,10}, \
- { 3583,12}, { 959,13}, { 511,12}, { 1087,13}, \
- { 639,12}, { 1343,13}, { 767,11}, { 3071,13}, \
- { 831,12}, { 1663,11}, { 3455,10}, { 6911,13}, \
- { 895,14}, { 511,13}, { 1023,12}, { 2047,13}, \
- { 1087,12}, { 2303,13}, { 1215,12}, { 2431,14}, \
- { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
- { 2687,11}, { 5375,13}, { 1407,12}, { 2815,11}, \
- { 5631,12}, { 2943,13}, { 1535,12}, { 3199,13}, \
- { 1663,12}, { 3327,13}, { 1727,14}, { 895,13}, \
- { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2047,12}, { 4095,14}, { 1151,13}, \
- { 2431,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2815,12}, { 5631,15}, { 767,14}, { 1535,13}, \
- { 3071,14}, { 1663,13}, { 3327,14}, { 1791,13}, \
- { 3583,14}, { 1919,15}, { 1023,14}, { 2303,13}, \
- { 4607,14}, { 2431,13}, { 4863,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 252
-#define MUL_FFT_THRESHOLD 2368
-
-#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 284, 5}, { 9, 4}, { 21, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 25, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 63, 8}, { 255, 7}, { 511,10}, \
- { 71, 8}, { 287, 7}, { 575,10}, { 79,11}, \
- { 47,10}, { 95, 9}, { 191, 8}, { 383, 7}, \
- { 767,10}, { 103, 9}, { 207, 8}, { 415,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 543, 7}, { 1087, 8}, { 575, 7}, { 1151,11}, \
- { 79, 8}, { 639, 7}, { 1279, 9}, { 335, 8}, \
- { 671, 7}, { 1343,10}, { 175, 8}, { 703, 7}, \
- { 1407,11}, { 95,10}, { 191, 9}, { 383, 8}, \
- { 767,10}, { 207, 9}, { 415, 8}, { 831, 7}, \
- { 1663, 9}, { 447, 8}, { 895,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 543, 8}, { 1087, 7}, \
- { 2175, 9}, { 575, 8}, { 1151,10}, { 303, 9}, \
- { 607, 8}, { 1215, 7}, { 2431,10}, { 319, 9}, \
- { 639, 8}, { 1279, 9}, { 671, 8}, { 1343, 7}, \
- { 2687,10}, { 351, 9}, { 703, 8}, { 1407,12}, \
- { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \
- { 207,10}, { 415, 9}, { 831, 8}, { 1663,11}, \
- { 223,10}, { 447, 9}, { 895,13}, { 63,11}, \
- { 255,10}, { 543, 8}, { 2175,11}, { 287,10}, \
- { 575, 9}, { 1151,10}, { 607, 9}, { 1215, 8}, \
- { 2431,11}, { 319, 9}, { 1279,10}, { 671, 9}, \
- { 1343, 8}, { 2687,11}, { 351,10}, { 703, 9}, \
- { 1407,10}, { 735,12}, { 191,11}, { 383,10}, \
- { 831, 9}, { 1663,12}, { 223,11}, { 447,10}, \
- { 895,11}, { 479, 9}, { 1919, 8}, { 3839,12}, \
- { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
- { 1087, 9}, { 2175,12}, { 287,11}, { 575,10}, \
- { 1151,11}, { 607,10}, { 1215, 9}, { 2431, 8}, \
- { 4863,10}, { 1279,11}, { 671,10}, { 1343, 9}, \
- { 2687,12}, { 351,11}, { 703,10}, { 1407,11}, \
- { 735,13}, { 191, 9}, { 3071, 7}, { 12287,11}, \
- { 799,12}, { 415,11}, { 831,10}, { 1663,12}, \
- { 447, 8}, { 7167,12}, { 479, 9}, { 3839,14}, \
- { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
- { 543,10}, { 2175, 9}, { 4607,11}, { 1215,10}, \
- { 2431,11}, { 1279,10}, { 2559,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 799,10}, { 3199, 9}, \
- { 6399,12}, { 895,13}, { 511,12}, { 1023,11}, \
- { 2047,12}, { 1087,13}, { 575,12}, { 1151,10}, \
- { 4607,13}, { 639,12}, { 1279,11}, { 2687,14}, \
- { 383,13}, { 767,11}, { 3071,12}, { 1599,13}, \
- { 895,12}, { 1791,11}, { 3583,13}, { 959,15}, \
- { 255,12}, { 2175,13}, { 1215,14}, { 639,13}, \
- { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \
- { 1471,11}, { 5887,14}, { 767,13}, { 1535,12}, \
- { 3071,13}, { 1599,12}, { 3199,13}, { 1663,12}, \
- { 3327,13}, { 1727,14}, { 895,13}, { 1791,12}, \
- { 3583,15}, { 511,14}, { 1023,13}, { 2175,14}, \
- { 1151,12}, { 4607,13}, { 2431,14}, { 1279,13}, \
- { 2687,14}, { 1407,13}, { 2815,15}, { 767,13}, \
- { 3199,14}, { 1663,13}, { 3327,14}, { 1791,13}, \
- { 3583,14}, { 1919,15}, { 1023,14}, { 2047,13}, \
- { 4095,14}, { 2303,13}, { 4607,14}, { 2431,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 257
-#define SQR_FFT_THRESHOLD 1856
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 113
-#define MULLO_MUL_N_THRESHOLD 4658
-
-#define DC_DIV_QR_THRESHOLD 123
-#define DC_DIVAPPR_Q_THRESHOLD 372
-#define DC_BDIV_QR_THRESHOLD 142
-#define DC_BDIV_Q_THRESHOLD 312
-
-#define INV_MULMOD_BNM1_THRESHOLD 58
-#define INV_NEWTON_THRESHOLD 315
-#define INV_APPR_THRESHOLD 315
-
-#define BINV_NEWTON_THRESHOLD 360
-#define REDC_1_TO_REDC_N_THRESHOLD 101
-
-#define MU_DIV_QR_THRESHOLD 979
-#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 93
-#define MU_BDIV_QR_THRESHOLD 889
-#define MU_BDIV_Q_THRESHOLD 1187
-
-#define MATRIX22_STRASSEN_THRESHOLD 9
-#define HGCD_THRESHOLD 234
-#define HGCD_APPR_THRESHOLD 300
-#define HGCD_REDUCE_THRESHOLD 1553
-#define GCD_DC_THRESHOLD 684
-#define GCDEXT_DC_THRESHOLD 525
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 21
-#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 1951
-#define SET_STR_PRECOMPUTE_THRESHOLD 4034
+/* Generated by tuneup.c, 2009-01-04, system compiler */
+
+#define MUL_KARATSUBA_THRESHOLD 30
+#define MUL_TOOM3_THRESHOLD 114
+#define MUL_TOOM44_THRESHOLD 244
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_KARATSUBA_THRESHOLD 58
+#define SQR_TOOM3_THRESHOLD 174
+#define SQR_TOOM4_THRESHOLD 312
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 142
+#define MULLOW_MUL_N_THRESHOLD 507
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 124
+#define POWM_THRESHOLD 224
+
+#define MATRIX22_STRASSEN_THRESHOLD 11
+#define HGCD_THRESHOLD 294
+#define GCD_DC_THRESHOLD 913
+#define GCDEXT_DC_THRESHOLD 830
+#define JACOBI_BASE_METHOD 2
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define USE_PREINV_DIVREM_1 1
+#define USE_PREINV_MOD_1 1
+#define DIVREM_2_THRESHOLD 0 /* always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 23
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 2743
+#define SET_STR_PRECOMPUTE_THRESHOLD 5147
+
+#define MUL_FFT_TABLE { 400, 800, 1600, 2816, 7168, 20480, 81920, 327680, 0 }
+#define MUL_FFT_MODF_THRESHOLD 280
+#define MUL_FFT_THRESHOLD 1664
+
+#define SQR_FFT_TABLE { 368, 800, 1728, 3328, 7168, 20480, 81920, 327680, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD 264
+#define SQR_FFT_THRESHOLD 1632
diff --git a/gmp/mpn/pa64/lshift.asm b/gmp/mpn/pa64/lshift.asm
index c0fc2921c1..0dceba20c1 100644
--- a/gmp/mpn/pa64/lshift.asm
+++ b/gmp/mpn/pa64/lshift.asm
@@ -3,30 +3,19 @@ dnl HP-PA 2.0 mpn_lshift -- Left shift.
dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500.
diff --git a/gmp/mpn/pa64/mul_1.asm b/gmp/mpn/pa64/mul_1.asm
index 6935c23ccd..fbb5f174ae 100644
--- a/gmp/mpn/pa64/mul_1.asm
+++ b/gmp/mpn/pa64/mul_1.asm
@@ -1,33 +1,22 @@
dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl the result in a second limb vector.
-dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa64/rshift.asm b/gmp/mpn/pa64/rshift.asm
index cfc242ea9c..80470c9892 100644
--- a/gmp/mpn/pa64/rshift.asm
+++ b/gmp/mpn/pa64/rshift.asm
@@ -3,30 +3,19 @@ dnl HP-PA 2.0 mpn_rshift -- Right shift.
dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500.
diff --git a/gmp/mpn/pa64/sqr_diagonal.asm b/gmp/mpn/pa64/sqr_diagonal.asm
index f6fadc93c6..73c64b06ed 100644
--- a/gmp/mpn/pa64/sqr_diagonal.asm
+++ b/gmp/mpn/pa64/sqr_diagonal.asm
@@ -1,32 +1,21 @@
dnl HP-PA 2.0 64-bit mpn_sqr_diagonal.
-dnl Copyright 2001-2003 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
diff --git a/gmp/mpn/pa64/sub_n.asm b/gmp/mpn/pa64/sub_n.asm
new file mode 100644
index 0000000000..8ad524da01
--- /dev/null
+++ b/gmp/mpn/pa64/sub_n.asm
@@ -0,0 +1,93 @@
+dnl HP-PA 2.0 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+dnl This runs at 2 cycles/limb on PA8000 and 1.6875 cycles/limb on PA8500. It
+dnl should be possible to reach the cache bandwidth 1.5 cycles/limb at least
+dnl with PA8500. The problem now is stalling of the first SUB,DB after LDO,
+dnl where the processor gets confused about where carry comes from.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`vp',`%r24')
+define(`n',`%r23')
+
+ifdef(`HAVE_ABI_2_0w',
+` .level 2.0w
+',` .level 2.0
+')
+PROLOGUE(mpn_sub_n)
+ sub %r0, n, %r22 C r22 = -n
+ depw,z %r22, 30, 3, %r28 C r28 = 2 * (-n & 7)
+ depw,z %r22, 28, 3, %r22 C r22 = 8 * (-n & 7)
+ sub up, %r22, up C offset up
+ sub vp, %r22, vp C offset vp
+ blr %r28, %r0 C branch into loop
+ sub rp, %r22, rp C offset rp and set carry
+
+LDEF(loop)
+ ldd 0(up), %r20 C limb 0 of this pass from up
+ ldd 0(vp), %r31 C limb 0 of this pass from vp
+ sub,db %r20, %r31, %r20 C difference minus incoming borrow
+ std %r20, 0(rp) C store difference limb 0
+LDEF(7) ldd 8(up), %r21 C entry point when 7 limbs remain in the first pass
+ ldd 8(vp), %r19
+ sub,db %r21, %r19, %r21 C borrow chains from the previous limb
+ std %r21, 8(rp)
+LDEF(6) ldd 16(up), %r20 C entry point when 6 limbs remain in the first pass
+ ldd 16(vp), %r31
+ sub,db %r20, %r31, %r20
+ std %r20, 16(rp)
+LDEF(5) ldd 24(up), %r21 C entry point when 5 limbs remain in the first pass
+ ldd 24(vp), %r19
+ sub,db %r21, %r19, %r21
+ std %r21, 24(rp)
+LDEF(4) ldd 32(up), %r20 C entry point when 4 limbs remain in the first pass
+ ldd 32(vp), %r31
+ sub,db %r20, %r31, %r20
+ std %r20, 32(rp)
+LDEF(3) ldd 40(up), %r21 C entry point when 3 limbs remain in the first pass
+ ldd 40(vp), %r19
+ sub,db %r21, %r19, %r21
+ std %r21, 40(rp)
+LDEF(2) ldd 48(up), %r20 C entry point when 2 limbs remain in the first pass
+ ldd 48(vp), %r31
+ sub,db %r20, %r31, %r20
+ std %r20, 48(rp)
+LDEF(1) ldd 56(up), %r21 C entry point when 1 limb remains in the first pass
+ ldd 56(vp),%r19
+ sub,db %r21, %r19, %r21 C last subtract of the pass, borrow must survive to the next pass
+ ldo 64(up), up C advance up by 8 limbs, ldo is used for the pointer updates
+ std %r21, 56(rp)
+ ldo 64(vp), vp C advance vp by 8 limbs
+ addib,> -8, n, L(loop) C n -= 8, loop again while n > 0
+ ldo 64(rp), rp C advance rp by 8 limbs, placed in the branch delay slot
+
+ add,dc %r0, %r0, %r29 C r29 = carry bit, which is the inverse of the borrow
+ subi 1, %r29, %r29 C r29 = 1 - carry = borrow out of the top limb
+ bve (%r2) C return, the instruction selected below fills the delay slot
+ifdef(`HAVE_ABI_2_0w',
+` copy %r29, %r28
+',` ldi 0, %r28
+')
+EPILOGUE(mpn_sub_n)
diff --git a/gmp/mpn/pa64/submul_1.asm b/gmp/mpn/pa64/submul_1.asm
index f8a1968e45..40678239fa 100644
--- a/gmp/mpn/pa64/submul_1.asm
+++ b/gmp/mpn/pa64/submul_1.asm
@@ -1,33 +1,22 @@
dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl subtract the result from a second limb vector.
-dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa64/udiv.asm b/gmp/mpn/pa64/udiv.asm
index 1380a85932..3775783a05 100644
--- a/gmp/mpn/pa64/udiv.asm
+++ b/gmp/mpn/pa64/udiv.asm
@@ -1,32 +1,21 @@
dnl HP-PA 2.0 64-bit mpn_udiv_qrnnd_r.
-dnl Copyright 2001-2003 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/pa64/umul.asm b/gmp/mpn/pa64/umul.asm
index c3341ecfe6..635e44fea0 100644
--- a/gmp/mpn/pa64/umul.asm
+++ b/gmp/mpn/pa64/umul.asm
@@ -1,36 +1,26 @@
dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Optimizations:
dnl * Avoid skip instructions
dnl * Put carry-generating and carry-consuming insns consecutively
-dnl * Don't allocate any stack, "home" positions for parameters could be used.
+dnl * Don't allocate any stack, "home" positions for parameters could be
+dnl used.
include(`../config.m4')
diff --git a/gmp/mpn/power/add_n.asm b/gmp/mpn/power/add_n.asm
index 6d6ca73da9..4fcafab7e1 100644
--- a/gmp/mpn/power/add_n.asm
+++ b/gmp/mpn/power/add_n.asm
@@ -1,32 +1,22 @@
dnl IBM POWER mpn_add_n -- Add two limb vectors of equal, non-zero length.
-dnl Copyright 1992, 1994-1996, 1999-2001, 2005 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2001, 2005 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/power/addmul_1.asm b/gmp/mpn/power/addmul_1.asm
index 76d8df3c76..fcda2c1263 100644
--- a/gmp/mpn/power/addmul_1.asm
+++ b/gmp/mpn/power/addmul_1.asm
@@ -1,33 +1,22 @@
dnl IBM POWER mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl result to a second limb vector.
-dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1999, 2000, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/power/gmp-mparam.h b/gmp/mpn/power/gmp-mparam.h
index 7cb36f963e..f9b10e6a47 100644
--- a/gmp/mpn/power/gmp-mparam.h
+++ b/gmp/mpn/power/gmp-mparam.h
@@ -1,40 +1,29 @@
/* POWER gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2002-2004 Free Software Foundation, Inc.
+Copyright 2002, 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* Generated by tuneup.c, 2003-02-10, gcc 3.2, POWER2 66.7MHz */
-#define MUL_TOOM22_THRESHOLD 12
-#define MUL_TOOM33_THRESHOLD 75
+#define MUL_KARATSUBA_THRESHOLD 12
+#define MUL_TOOM3_THRESHOLD 75
#define SQR_BASECASE_THRESHOLD 7
-#define SQR_TOOM2_THRESHOLD 28
+#define SQR_KARATSUBA_THRESHOLD 28
#define SQR_TOOM3_THRESHOLD 86
#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
diff --git a/gmp/mpn/power/lshift.asm b/gmp/mpn/power/lshift.asm
index efa210556d..a4adb7aad5 100644
--- a/gmp/mpn/power/lshift.asm
+++ b/gmp/mpn/power/lshift.asm
@@ -1,32 +1,21 @@
dnl IBM POWER mpn_lshift -- Shift a number left.
-dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1999, 2000, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/power/mul_1.asm b/gmp/mpn/power/mul_1.asm
index 38b7b66be0..bd33942adf 100644
--- a/gmp/mpn/power/mul_1.asm
+++ b/gmp/mpn/power/mul_1.asm
@@ -1,33 +1,22 @@
dnl IBM POWER mpn_mul_1 -- Multiply a limb vector with a limb and store the
dnl result in a second limb vector.
-dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1999, 2000, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/power/rshift.asm b/gmp/mpn/power/rshift.asm
index 1d1815ccb5..4645015ccd 100644
--- a/gmp/mpn/power/rshift.asm
+++ b/gmp/mpn/power/rshift.asm
@@ -1,32 +1,21 @@
dnl IBM POWER mpn_rshift -- Shift a number right.
-dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1999, 2000, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/power/sdiv.asm b/gmp/mpn/power/sdiv.asm
index 4a9ed143b8..7a798022cd 100644
--- a/gmp/mpn/power/sdiv.asm
+++ b/gmp/mpn/power/sdiv.asm
@@ -1,30 +1,19 @@
dnl Copyright 1999, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/power/sub_n.asm b/gmp/mpn/power/sub_n.asm
index 390c802d8b..d34415d7e4 100644
--- a/gmp/mpn/power/sub_n.asm
+++ b/gmp/mpn/power/sub_n.asm
@@ -1,33 +1,23 @@
dnl IBM POWER mpn_sub_n -- Subtract two limb vectors of equal, non-zero
dnl length.
-dnl Copyright 1992, 1994-1996, 1999-2001, 2005 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2001, 2005 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/power/submul_1.asm b/gmp/mpn/power/submul_1.asm
index 1788e0d4f4..3c3492d00c 100644
--- a/gmp/mpn/power/submul_1.asm
+++ b/gmp/mpn/power/submul_1.asm
@@ -1,33 +1,22 @@
dnl IBM POWER mpn_submul_1 -- Multiply a limb vector with a limb and subtract
dnl the result from a second limb vector.
-dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+dnl Copyright 1992, 1994, 1999, 2000, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl INPUT PARAMETERS
diff --git a/gmp/mpn/power/umul.asm b/gmp/mpn/power/umul.asm
index 5a0599e21d..996f2e6cb1 100644
--- a/gmp/mpn/power/umul.asm
+++ b/gmp/mpn/power/umul.asm
@@ -1,30 +1,19 @@
dnl Copyright 1999, 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/750/com.asm b/gmp/mpn/powerpc32/750/com.asm
deleted file mode 100644
index 1b8b574b9c..0000000000
--- a/gmp/mpn/powerpc32/750/com.asm
+++ /dev/null
@@ -1,79 +0,0 @@
-dnl PowerPC 750 mpn_com -- mpn bitwise one's complement
-
-dnl Copyright 2002, 2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C 603e: ?
-C 604e: 3.0
-C 75x (G3): 2.0
-C 7400,7410 (G4): 2.0
-C 744x,745x (G4+): 3.0
-
-C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C This loop form is necessary for the claimed speed.
-
-ASM_START()
-PROLOGUE(mpn_com)
-
- C r3 dst
- C r4 src
- C r5 size
-
- mtctr r5 C size
- lwz r5, 0(r4) C src low limb
-
- sub r4, r4, r3 C src-dst
- subi r3, r3, 4 C dst-4
-
- addi r4, r4, 8 C src-dst+8
- bdz L(one)
-
-L(top):
- C r3 &dst[i-1]
- C r4 src-dst
- C r5 src[i]
- C r6 scratch
-
- not r6, r5 C ~src[i]
- lwzx r5, r4,r3 C src[i+1]
-
- stwu r6, 4(r3) C dst[i]
- bdnz L(top)
-
-L(one):
- not r6, r5
-
- stw r6, 4(r3) C dst[size-1]
- blr
-
-EPILOGUE()
diff --git a/gmp/mpn/powerpc32/750/com_n.asm b/gmp/mpn/powerpc32/750/com_n.asm
new file mode 100644
index 0000000000..02fc4b6587
--- /dev/null
+++ b/gmp/mpn/powerpc32/750/com_n.asm
@@ -0,0 +1,68 @@
+dnl PowerPC 750 mpn_com_n -- mpn bitwise one's complement
+
+dnl Copyright 2002, 2003 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C 603e: ?
+C 604e: 3.0
+C 75x (G3): 2.0
+C 7400,7410 (G4): 2.0
+C 744x,745x (G4+): 3.0
+
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This loop form is necessary for the claimed speed.
+
+ASM_START()
+PROLOGUE(mpn_com_n)
+
+ C r3 dst
+ C r4 src
+ C r5 size
+
+ mtctr r5 C size
+ lwz r5, 0(r4) C src low limb
+
+ sub r4, r4, r3 C src-dst
+ subi r3, r3, 4 C dst-4
+
+ addi r4, r4, 8 C src-dst+8, so r4+r3 addresses src[i+1]
+ bdz L(one) C ctr reaches 0 when size is 1: skip the loop
+
+L(top):
+ C r3 &dst[i-1]
+ C r4 src-dst
+ C r5 src[i]
+ C r6 scratch
+
+ not r6, r5 C ~src[i]
+ lwzx r5, r4,r3 C src[i+1]
+
+ stwu r6, 4(r3) C dst[i]
+ bdnz L(top)
+
+L(one):
+ not r6, r5 C ~src[size-1], the limb loaded last
+
+ stw r6, 4(r3) C dst[size-1]
+ blr
+
+EPILOGUE()
diff --git a/gmp/mpn/powerpc32/750/gmp-mparam.h b/gmp/mpn/powerpc32/750/gmp-mparam.h
index 3667e8596d..448f2676df 100644
--- a/gmp/mpn/powerpc32/750/gmp-mparam.h
+++ b/gmp/mpn/powerpc32/750/gmp-mparam.h
@@ -1,35 +1,24 @@
/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2002, 2004, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2002, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* This file is used for 75x (G3) and for 7400/7410 (G4), both which have
@@ -37,156 +26,49 @@ see https://www.gnu.org/licenses/. */
/* 450 MHz PPC 7400 */
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 38
-#define USE_PREINV_DIVREM_1 1
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 10
-#define MUL_TOOM33_THRESHOLD 38
-#define MUL_TOOM44_THRESHOLD 99
-#define MUL_TOOM6H_THRESHOLD 141
-#define MUL_TOOM8H_THRESHOLD 212
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66
-
-#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 18
-#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 142
-#define SQR_TOOM6_THRESHOLD 173
-#define SQR_TOOM8_THRESHOLD 309
-
-#define MULMOD_BNM1_THRESHOLD 9
-#define SQRMOD_BNM1_THRESHOLD 11
-
-#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 220, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 8, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \
- { 7, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \
- { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \
- { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \
- { 67, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
- { 127, 7}, { 255, 9}, { 71, 8}, { 143, 7}, \
- { 287, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \
- { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \
- { 319, 9}, { 175, 8}, { 351, 7}, { 703,10}, \
- { 95, 9}, { 191, 8}, { 383, 9}, { 207,10}, \
- { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \
- { 415, 8}, { 831,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \
- { 703, 8}, { 1407,11}, { 191,10}, { 415, 9}, \
- { 831,11}, { 223,10}, { 447, 9}, { 895,12}, \
- { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
- { 575,11}, { 351,10}, { 703, 9}, { 1407,12}, \
- { 191,11}, { 415,10}, { 831,11}, { 447,10}, \
- { 895,13}, { 127,12}, { 255,11}, { 543,10}, \
- { 1087,11}, { 575,12}, { 319,11}, { 703,10}, \
- { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \
- { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
- { 703,11}, { 1407,13}, { 383,12}, { 895,11}, \
- { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1215,13}, { 639,12}, { 1407,13}, { 895,12}, \
- { 1919,14}, { 511,13}, { 1023,12}, { 2047,13}, \
- { 1151,12}, { 2303,13}, { 1407,14}, { 767,13}, \
- { 1919,10}, { 15359,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 154
-#define MUL_FFT_THRESHOLD 2688
-
-#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
- { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
- { 31, 8}, { 19, 7}, { 39, 8}, { 27, 9}, \
- { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 95,10}, { 31, 9}, { 63, 8}, \
- { 127, 7}, { 255, 9}, { 71, 8}, { 143, 7}, \
- { 287, 9}, { 79, 8}, { 159,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 143, 8}, { 287, 7}, { 575,10}, \
- { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \
- { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \
- { 207,10}, { 111,11}, { 63,10}, { 127, 9}, \
- { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \
- { 159, 9}, { 319,10}, { 175, 9}, { 351,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \
- { 415, 8}, { 831,10}, { 223,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \
- { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 447, 9}, { 895,12}, { 127,11}, { 255,10}, \
- { 511,11}, { 287,10}, { 575,11}, { 319,10}, \
- { 639,11}, { 351,10}, { 703, 9}, { 1407,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,11}, { 447,10}, { 895,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1023,11}, { 575,12}, \
- { 319,11}, { 703,10}, { 1407,12}, { 383,11}, \
- { 831,12}, { 447,11}, { 895,10}, { 1791,11}, \
- { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \
- { 575,11}, { 1215,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 895,11}, { 1791,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \
- { 1471,13}, { 767,12}, { 1535,13}, { 895,12}, \
- { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \
- { 1407,14}, { 767,13}, { 1919,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 152
-#define SQR_FFT_THRESHOLD 1728
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 5240
-
-#define DC_DIV_QR_THRESHOLD 31
-#define DC_DIVAPPR_Q_THRESHOLD 108
-#define DC_BDIV_QR_THRESHOLD 35
-#define DC_BDIV_Q_THRESHOLD 88
-
-#define INV_MULMOD_BNM1_THRESHOLD 42
-#define INV_NEWTON_THRESHOLD 149
-#define INV_APPR_THRESHOLD 125
-
-#define BINV_NEWTON_THRESHOLD 156
-#define REDC_1_TO_REDC_N_THRESHOLD 39
-
-#define MU_DIV_QR_THRESHOLD 807
-#define MU_DIVAPPR_Q_THRESHOLD 807
-#define MUPI_DIV_QR_THRESHOLD 66
-#define MU_BDIV_QR_THRESHOLD 667
-#define MU_BDIV_Q_THRESHOLD 807
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 87
-#define GCD_DC_THRESHOLD 233
-#define GCDEXT_DC_THRESHOLD 198
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 390
-#define SET_STR_PRECOMPUTE_THRESHOLD 814
+/* Generated by tuneup.c, 2008-12-23, gcc 4.0 */
+
+#define MUL_KARATSUBA_THRESHOLD 10
+#define MUL_TOOM3_THRESHOLD 41
+#define MUL_TOOM44_THRESHOLD 88
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_KARATSUBA_THRESHOLD 18
+#define SQR_TOOM3_THRESHOLD 57
+#define SQR_TOOM4_THRESHOLD 88
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 32
+#define MULLOW_MUL_N_THRESHOLD 194
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 33
+#define POWM_THRESHOLD 48
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 91
+#define GCD_DC_THRESHOLD 256
+#define GCDEXT_DC_THRESHOLD 256
+#define JACOBI_BASE_METHOD 1
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define USE_PREINV_DIVREM_1 1
+#define USE_PREINV_MOD_1 1
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 390
+#define SET_STR_PRECOMPUTE_THRESHOLD 814
+
+#define MUL_FFT_TABLE { 240, 608, 896, 2560, 6144, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 232
+#define MUL_FFT_THRESHOLD 1792
+
+#define SQR_FFT_TABLE { 240, 544, 896, 2560, 6144, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD 216
+#define SQR_FFT_THRESHOLD 1792
diff --git a/gmp/mpn/powerpc32/750/lshift.asm b/gmp/mpn/powerpc32/750/lshift.asm
index 3a1c1a7212..9298793f27 100644
--- a/gmp/mpn/powerpc32/750/lshift.asm
+++ b/gmp/mpn/powerpc32/750/lshift.asm
@@ -3,30 +3,19 @@ dnl PowerPC 750 mpn_lshift -- mpn left shift.
dnl Copyright 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/750/rshift.asm b/gmp/mpn/powerpc32/750/rshift.asm
index 4825fee618..944e8690a7 100644
--- a/gmp/mpn/powerpc32/750/rshift.asm
+++ b/gmp/mpn/powerpc32/750/rshift.asm
@@ -3,30 +3,19 @@ dnl PowerPC 750 mpn_rshift -- mpn right shift.
dnl Copyright 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/README b/gmp/mpn/powerpc32/README
index 887e78b290..43aca466c2 100644
--- a/gmp/mpn/powerpc32/README
+++ b/gmp/mpn/powerpc32/README
@@ -3,28 +3,17 @@ Copyright 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/powerpc32/addlsh1_n.asm b/gmp/mpn/powerpc32/addlsh1_n.asm
index 71645c3ec3..db627a0a31 100644
--- a/gmp/mpn/powerpc32/addlsh1_n.asm
+++ b/gmp/mpn/powerpc32/addlsh1_n.asm
@@ -3,30 +3,19 @@ dnl PowerPC-32 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/addmul_1.asm b/gmp/mpn/powerpc32/addmul_1.asm
index 7f47ab2ce7..6260691b34 100644
--- a/gmp/mpn/powerpc32/addmul_1.asm
+++ b/gmp/mpn/powerpc32/addmul_1.asm
@@ -1,33 +1,23 @@
dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl result to a second limb vector.
-dnl Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc.
+dnl Copyright 1995, 1997, 1998, 2000, 2001, 2002, 2003, 2005 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/aix.m4 b/gmp/mpn/powerpc32/aix.m4
index fde20200b2..81199c78d4 100644
--- a/gmp/mpn/powerpc32/aix.m4
+++ b/gmp/mpn/powerpc32/aix.m4
@@ -1,33 +1,22 @@
divert(-1)
dnl m4 macros for AIX 32-bit assembly.
-dnl Copyright 2000-2002, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2005, 2006 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`ASM_START',
` .toc')
diff --git a/gmp/mpn/powerpc32/aors_n.asm b/gmp/mpn/powerpc32/aors_n.asm
index 25ece0966e..f9e9b50d52 100644
--- a/gmp/mpn/powerpc32/aors_n.asm
+++ b/gmp/mpn/powerpc32/aors_n.asm
@@ -3,44 +3,30 @@ dnl PowerPC-32 mpn_add_n and mpn_sub_n.
dnl Copyright 2002, 2005, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C 603e: ?
-C 604e: ? old: 3.25
-C 75x (G3): ? old: 3.5
-C 7400,7410 (G4): 3.25
-C 744x,745x (G4+): 4
-C POWER3/PPC630 2
-C POWER4/PPC970 2.4
-C POWER5 2.75
-C POWER6 40-140
-C POWER7 3
+C cycles/limb
+C 603e: ?
+C 604e: ? old: 3.25
+C 75x (G3): ? old: 3.5
+C 7400,7410 (G4): 3.25
+C 744x,745x (G4+): 4
+C power4/ppc970: ? old: 2.0
+C power5: ? old: 2.5
C INPUT PARAMETERS
define(`rp', `r3')
diff --git a/gmp/mpn/powerpc32/bdiv_dbm1c.asm b/gmp/mpn/powerpc32/bdiv_dbm1c.asm
index 72b2c482e4..41870fbe8a 100644
--- a/gmp/mpn/powerpc32/bdiv_dbm1c.asm
+++ b/gmp/mpn/powerpc32/bdiv_dbm1c.asm
@@ -3,30 +3,19 @@ dnl PPC32 mpn_bdiv_dbm1c.
dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/darwin.m4 b/gmp/mpn/powerpc32/darwin.m4
index db4226800b..b76103a8ca 100644
--- a/gmp/mpn/powerpc32/darwin.m4
+++ b/gmp/mpn/powerpc32/darwin.m4
@@ -2,44 +2,31 @@ divert(-1)
dnl m4 macros for Mac OS 32-bit assembly.
dnl Copyright 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`ASM_START',`')
-dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
dnl
define(`PROLOGUE_cpu',
-m4_assert_numargs_range(1,2)
-`ifelse(`$2',toc,,
-`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
- .text
+m4_assert_numargs(1)
+` .text
.globl $1
.align 3
$1:')
diff --git a/gmp/mpn/powerpc32/diveby3.asm b/gmp/mpn/powerpc32/diveby3.asm
index 288a7d30ac..cf11a19824 100644
--- a/gmp/mpn/powerpc32/diveby3.asm
+++ b/gmp/mpn/powerpc32/diveby3.asm
@@ -1,32 +1,21 @@
dnl PowerPC-32 mpn_divexact_by3 -- mpn by 3 exact division
dnl Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/divrem_2.asm b/gmp/mpn/powerpc32/divrem_2.asm
index c6e64efe23..3261cbd727 100644
--- a/gmp/mpn/powerpc32/divrem_2.asm
+++ b/gmp/mpn/powerpc32/divrem_2.asm
@@ -1,32 +1,21 @@
dnl PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
-dnl Copyright 2007, 2008, 2012 Free Software Foundation, Inc.
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -95,9 +84,9 @@ C Compute di from d1
bge- cr7, L(9)
add r0, r0, r10
cmplw cr7, r0, r10
- cmplw cr6, r6, r0
+ cmplw cr6, r0, r6
addi r31, r31, -1 C q1--
- crorc 28, 28, 25
+ cror 28, 28, 25
bc+ 12, 28, L(9)
addi r31, r31, -1 C q1--
add r0, r0, r10
@@ -112,9 +101,9 @@ L(9): subf r0, r6, r0
bge- cr7, L(13)
add r0, r0, r10
cmplw cr7, r0, r10
- cmplw cr6, r11, r0
+ cmplw cr6, r0, r11
addi r6, r6, -1 C q0--
- crorc 28, 28, 25
+ cror 28, 28, 25
bc+ 12, 28, L(13)
C add r0, r0, r10 C final remainder
addi r6, r6, -1 C q0--
diff --git a/gmp/mpn/powerpc32/eabi.m4 b/gmp/mpn/powerpc32/eabi.m4
index cd7633c633..20f9a2f327 100644
--- a/gmp/mpn/powerpc32/eabi.m4
+++ b/gmp/mpn/powerpc32/eabi.m4
@@ -2,32 +2,21 @@ divert(-1)
dnl m4 macros for powerpc32 eABI assembly.
dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`ASM_START',`')
diff --git a/gmp/mpn/powerpc32/elf.m4 b/gmp/mpn/powerpc32/elf.m4
index a64a1271ff..ab1559ebd4 100644
--- a/gmp/mpn/powerpc32/elf.m4
+++ b/gmp/mpn/powerpc32/elf.m4
@@ -2,43 +2,31 @@ divert(-1)
dnl m4 macros for powerpc32 GNU/Linux assembly.
dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`ASM_START',`')
-dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
dnl
define(`PROLOGUE_cpu',
-m4_assert_numargs_range(1,2)
-`ifelse(`$2',toc,,
-`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+m4_assert_numargs(1)
+ `
.section ".text"
.align 3
.globl $1
diff --git a/gmp/mpn/powerpc32/gmp-mparam.h b/gmp/mpn/powerpc32/gmp-mparam.h
index 784a6d7b74..1676317a40 100644
--- a/gmp/mpn/powerpc32/gmp-mparam.h
+++ b/gmp/mpn/powerpc32/gmp-mparam.h
@@ -1,36 +1,25 @@
/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2004, 2008-2010, 2014 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2008, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* This file is supposed to be used for 604, 604e, 744x/745x/747x (G4+), i.e.,
@@ -42,176 +31,53 @@ see https://www.gnu.org/licenses/. */
7400/7410 (G4), both which have much slower multiply instructions. */
/* 1417 MHz PPC 7447A */
-/* FFT tuning limit = 12500000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 1
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 49
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 2
-#define DIV_QR_1_UNNORM_THRESHOLD 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 69
-
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 73
-#define MUL_TOOM44_THRESHOLD 106
-#define MUL_TOOM6H_THRESHOLD 156
-#define MUL_TOOM8H_THRESHOLD 236
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 71
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 22
-#define SQR_TOOM3_THRESHOLD 74
-#define SQR_TOOM4_THRESHOLD 130
-#define SQR_TOOM6_THRESHOLD 189
-#define SQR_TOOM8_THRESHOLD 284
-
-#define MULMID_TOOM42_THRESHOLD 32
-
-#define MULMOD_BNM1_THRESHOLD 9
-#define SQRMOD_BNM1_THRESHOLD 14
-
-#define MUL_FFT_MODF_THRESHOLD 284 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 284, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
- { 33, 8}, { 19, 7}, { 39, 8}, { 23, 7}, \
- { 47, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \
- { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95,10}, { 31, 9}, { 71, 8}, { 143, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255, 9}, { 135, 8}, \
- { 271, 9}, { 143,10}, { 79, 9}, { 159, 8}, \
- { 319, 9}, { 175,10}, { 95, 9}, { 191, 8}, \
- { 383, 9}, { 207, 8}, { 415,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
- { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207, 9}, { 415, 8}, { 831,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 415, 9}, \
- { 831,11}, { 223,10}, { 447, 9}, { 895,10}, \
- { 479, 9}, { 959,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 575,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \
- { 1407,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,10}, { 831,11}, { 447,10}, { 895,11}, \
- { 479,10}, { 959,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
- { 639,10}, { 1279,11}, { 703,10}, { 1407,12}, \
- { 383,11}, { 831,12}, { 447,11}, { 959,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,10}, { 2431,12}, { 639,11}, { 1279,12}, \
- { 703,11}, { 1407,13}, { 383,12}, { 895,11}, \
- { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1215,11}, { 2431,13}, { 639,12}, { 1471,13}, \
- { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \
- { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
- { 2431,13}, { 1407,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 160
-#define MUL_FFT_THRESHOLD 3712
-
-#define SQR_FFT_MODF_THRESHOLD 248 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 248, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
- { 33, 8}, { 19, 7}, { 39, 8}, { 27, 9}, \
- { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
- { 63, 8}, { 127, 9}, { 71, 8}, { 143, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255, 7}, { 511, 9}, \
- { 143,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
- { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \
- { 383, 9}, { 207, 8}, { 415, 7}, { 831,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
- { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \
- { 191, 9}, { 383,10}, { 207, 9}, { 415, 8}, \
- { 831,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 447, 9}, { 895,12}, { 127,11}, { 255,10}, \
- { 543,11}, { 287,10}, { 607,11}, { 319,10}, \
- { 639,11}, { 351,10}, { 703, 9}, { 1407,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,11}, { 447,10}, { 895,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
- { 703,10}, { 1407,12}, { 383,11}, { 831,12}, \
- { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
- { 1279,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
- { 639,12}, { 1471,13}, { 767,12}, { 1599,13}, \
- { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
- { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 154
-#define SQR_FFT_THRESHOLD 2688
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 45
-#define MULLO_MUL_N_THRESHOLD 6633
-
-#define DC_DIV_QR_THRESHOLD 44
-#define DC_DIVAPPR_Q_THRESHOLD 142
-#define DC_BDIV_QR_THRESHOLD 54
-#define DC_BDIV_Q_THRESHOLD 124
-
-#define INV_MULMOD_BNM1_THRESHOLD 43
-#define INV_NEWTON_THRESHOLD 179
-#define INV_APPR_THRESHOLD 157
-
-#define BINV_NEWTON_THRESHOLD 214
-#define REDC_1_TO_REDC_N_THRESHOLD 55
-
-#define MU_DIV_QR_THRESHOLD 998
-#define MU_DIVAPPR_Q_THRESHOLD 1078
-#define MUPI_DIV_QR_THRESHOLD 84
-#define MU_BDIV_QR_THRESHOLD 872
-#define MU_BDIV_Q_THRESHOLD 1078
-
-#define POWM_SEC_TABLE 1,19,102,428,1378
-
-#define MATRIX22_STRASSEN_THRESHOLD 12
-#define HGCD_THRESHOLD 120
-#define HGCD_APPR_THRESHOLD 166
-#define HGCD_REDUCE_THRESHOLD 1679
-#define GCD_DC_THRESHOLD 339
-#define GCDEXT_DC_THRESHOLD 273
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 781
-#define SET_STR_PRECOMPUTE_THRESHOLD 1505
-
-#define FAC_DSC_THRESHOLD 141
-#define FAC_ODD_THRESHOLD 29
+
+/* Generated by tuneup.c, 2009-01-14, gcc 4.3 */
+
+#define MUL_KARATSUBA_THRESHOLD 14
+#define MUL_TOOM3_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_KARATSUBA_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 77
+#define SQR_TOOM4_THRESHOLD 130
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 52
+#define MULLOW_MUL_N_THRESHOLD 292
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 46
+#define POWM_THRESHOLD 87
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 127
+#define GCD_DC_THRESHOLD 361
+#define GCDEXT_DC_THRESHOLD 382
+#define JACOBI_BASE_METHOD 1
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 7
+#define MOD_1_2_THRESHOLD 21
+#define MOD_1_4_THRESHOLD 68
+#define USE_PREINV_DIVREM_1 1
+#define USE_PREINV_MOD_1 1
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 22
+#define GET_STR_PRECOMPUTE_THRESHOLD 42
+#define SET_STR_DC_THRESHOLD 788
+#define SET_STR_PRECOMPUTE_THRESHOLD 1554
+
+#define MUL_FFT_TABLE { 304, 672, 1152, 2560, 6144, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD 320
+#define MUL_FFT_THRESHOLD 2816
+
+#define SQR_FFT_TABLE { 272, 672, 1152, 2560, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD 288
+#define SQR_FFT_THRESHOLD 2304
diff --git a/gmp/mpn/powerpc32/invert_limb.asm b/gmp/mpn/powerpc32/invert_limb.asm
deleted file mode 100644
index 612bfe523c..0000000000
--- a/gmp/mpn/powerpc32/invert_limb.asm
+++ /dev/null
@@ -1,142 +0,0 @@
-dnl PowerPC-32 mpn_invert_limb -- Invert a normalized limb.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C 603e: ?
-C 604e: ?
-C 75x (G3): ?
-C 7400,7410 (G4): ?
-C 744x,745x (G4+): 32
-C power4/ppc970: ?
-C power5: ?
-
-EXTERN(approx_tab)
-
-ASM_START()
-PROLOGUE(mpn_invert_limb)
- rlwinm r6, r3, 11, 22, 30 C extract bits 30..22 to pos 2^1
- srwi r10, r3, 11 C extract bits 31..11
- LEA( r9, approx_tab) C N.B. clobbers r0 for ELF and Darwin
- lhzx r9, r9, r6 C w2
- addi r0, r10, 1
- mullw r11, r9, r9
- slwi r9, r9, 4
- mulhwu r7, r11, r0
- rlwinm r11, r3, 0, 31, 31 C extract bit 0
- addi r0, r9, -1
- srwi r9, r3, 1 C d >> 1
- subf r0, r7, r0 C w1
- add r9, r9, r11 C d31
- mullw r9, r0, r9 C w1 * d31
- srwi r10, r0, 1 C w1 >> 1
- neg r11, r11
- and r11, r10, r11
- subf r11, r9, r11
- mulhwu r9, r11, r0
- slwi r0, r0, 15
- srwi r9, r9, 1
- add r0, r9, r0 C w0
- mullw r10, r0, r3
- mulhwu r9, r0, r3
- addc r11, r10, r3
- adde r3, r9, r3
- subf r3, r3, r0
- blr
-EPILOGUE()
-
-DEF_OBJECT(approx_tab)
- .short 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
- .short 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
- .short 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
- .short 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
- .short 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
- .short 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
- .short 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
- .short 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
- .short 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
- .short 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
- .short 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
- .short 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
- .short 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
- .short 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
- .short 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
- .short 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
- .short 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
- .short 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
- .short 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
- .short 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
- .short 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
- .short 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
- .short 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
- .short 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
- .short 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
- .short 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
- .short 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
- .short 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
- .short 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
- .short 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
- .short 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
- .short 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
- .short 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
- .short 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
- .short 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
- .short 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
- .short 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
- .short 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
- .short 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
- .short 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
- .short 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
- .short 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
- .short 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
- .short 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
- .short 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
- .short 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
- .short 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
- .short 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
- .short 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
- .short 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
- .short 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
- .short 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
- .short 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
- .short 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
- .short 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
- .short 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
- .short 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
- .short 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
- .short 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
- .short 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
- .short 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
- .short 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
- .short 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
- .short 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
-END_OBJECT(approx_tab)
-ASM_END()
diff --git a/gmp/mpn/powerpc32/lshift.asm b/gmp/mpn/powerpc32/lshift.asm
index 948f8c6cf3..e306173146 100644
--- a/gmp/mpn/powerpc32/lshift.asm
+++ b/gmp/mpn/powerpc32/lshift.asm
@@ -1,32 +1,22 @@
dnl PowerPC-32 mpn_lshift -- Shift a number left.
-dnl Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc.
+dnl Copyright 1995, 1998, 2000, 2002, 2003, 2004, 2005 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -48,7 +38,7 @@ C cnt r6
ASM_START()
PROLOGUE(mpn_lshift)
- cmpwi cr0, r5, 30 C more than 30 limbs?
+ cmpwi cr0, r5, 12 C more than 12 limbs?
slwi r0, r5, 2
add r4, r4, r0 C make r4 point at end of s1
add r7, r3, r0 C make r7 point at end of res
@@ -163,4 +153,4 @@ L(loopU):
stw r12, -20(r7)
lmw r24, -32(r1) C restore registers
blr
-EPILOGUE()
+EPILOGUE(mpn_lshift)
diff --git a/gmp/mpn/powerpc32/lshiftc.asm b/gmp/mpn/powerpc32/lshiftc.asm
deleted file mode 100644
index 61606d1b66..0000000000
--- a/gmp/mpn/powerpc32/lshiftc.asm
+++ /dev/null
@@ -1,168 +0,0 @@
-dnl PowerPC-32 mpn_lshiftc.
-
-dnl Copyright 1995, 1998, 2000, 2002-2005, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C 603e: ?
-C 604e: 3.0
-C 75x (G3): 3.0
-C 7400,7410 (G4): 3.0
-C 7445,7455 (G4+): 2.5
-C 7447,7457 (G4+): 2.25
-C power4/ppc970: 2.5
-C power5: 2.5
-
-C INPUT PARAMETERS
-C rp r3
-C up r4
-C n r5
-C cnt r6
-
-ASM_START()
-PROLOGUE(mpn_lshiftc)
- cmpwi cr0, r5, 30 C more than 30 limbs?
- slwi r0, r5, 2
- add r4, r4, r0 C make r4 point at end of s1
- add r7, r3, r0 C make r7 point at end of res
- bgt L(BIG) C branch if more than 12 limbs
-
- mtctr r5 C copy size into CTR
- subfic r8, r6, 32
- lwzu r11, -4(r4) C load first s1 limb
- srw r3, r11, r8 C compute function return value
- bdz L(end1)
-
-L(oop): lwzu r10, -4(r4)
- slw r9, r11, r6
- srw r12, r10, r8
- nor r9, r9, r12
- stwu r9, -4(r7)
- bdz L(end2)
- lwzu r11, -4(r4)
- slw r9, r10, r6
- srw r12, r11, r8
- nor r9, r9, r12
- stwu r9, -4(r7)
- bdnz L(oop)
-
-L(end1):
- slw r0, r11, r6
- nor r0, r0, r0
- stw r0, -4(r7)
- blr
-L(end2):
- slw r0, r10, r6
- nor r0, r0, r0
- stw r0, -4(r7)
- blr
-
-L(BIG):
- stmw r24, -32(r1) C save registers we are supposed to preserve
- lwzu r9, -4(r4)
- subfic r8, r6, 32
- srw r3, r9, r8 C compute function return value
- slw r0, r9, r6
- addi r5, r5, -1
-
- andi. r10, r5, 3 C count for spill loop
- beq L(e)
- mtctr r10
- lwzu r28, -4(r4)
- bdz L(xe0)
-
-L(loop0):
- slw r12, r28, r6
- srw r24, r28, r8
- lwzu r28, -4(r4)
- nor r24, r0, r24
- stwu r24, -4(r7)
- mr r0, r12
- bdnz L(loop0) C taken at most once!
-
-L(xe0): slw r12, r28, r6
- srw r24, r28, r8
- nor r24, r0, r24
- stwu r24, -4(r7)
- mr r0, r12
-
-L(e): srwi r5, r5, 2 C count for unrolled loop
- addi r5, r5, -1
- mtctr r5
- lwz r28, -4(r4)
- lwz r29, -8(r4)
- lwz r30, -12(r4)
- lwzu r31, -16(r4)
-
-L(loopU):
- slw r9, r28, r6
- srw r24, r28, r8
- lwz r28, -4(r4)
- slw r10, r29, r6
- srw r25, r29, r8
- lwz r29, -8(r4)
- slw r11, r30, r6
- srw r26, r30, r8
- lwz r30, -12(r4)
- slw r12, r31, r6
- srw r27, r31, r8
- lwzu r31, -16(r4)
- nor r24, r0, r24
- stw r24, -4(r7)
- nor r25, r9, r25
- stw r25, -8(r7)
- nor r26, r10, r26
- stw r26, -12(r7)
- nor r27, r11, r27
- stwu r27, -16(r7)
- mr r0, r12
- bdnz L(loopU)
-
- slw r9, r28, r6
- srw r24, r28, r8
- slw r10, r29, r6
- srw r25, r29, r8
- slw r11, r30, r6
- srw r26, r30, r8
- slw r12, r31, r6
- srw r27, r31, r8
- nor r24, r0, r24
- stw r24, -4(r7)
- nor r25, r9, r25
- stw r25, -8(r7)
- nor r26, r10, r26
- stw r26, -12(r7)
- nor r27, r11, r27
- stw r27, -16(r7)
- nor r12, r12, r12
- stw r12, -20(r7)
- lmw r24, -32(r1) C restore registers
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc32/mod_34lsub1.asm b/gmp/mpn/powerpc32/mod_34lsub1.asm
index 6d7fe4d089..fa0f0139ee 100644
--- a/gmp/mpn/powerpc32/mod_34lsub1.asm
+++ b/gmp/mpn/powerpc32/mod_34lsub1.asm
@@ -3,30 +3,19 @@ dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
dnl Copyright 2002, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/mode1o.asm b/gmp/mpn/powerpc32/mode1o.asm
index e8a6b5e28a..ba9a393b09 100644
--- a/gmp/mpn/powerpc32/mode1o.asm
+++ b/gmp/mpn/powerpc32/mode1o.asm
@@ -1,32 +1,21 @@
dnl PowerPC-32 mpn_modexact_1_odd -- mpn by limb exact remainder.
dnl Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/mul_1.asm b/gmp/mpn/powerpc32/mul_1.asm
index e42087cfa8..e6f44e21d9 100644
--- a/gmp/mpn/powerpc32/mul_1.asm
+++ b/gmp/mpn/powerpc32/mul_1.asm
@@ -5,30 +5,19 @@ dnl Copyright 1995, 1997, 2000, 2002, 2003, 2005 Free Software Foundation,
dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/p3-p7/aors_n.asm b/gmp/mpn/powerpc32/p3-p7/aors_n.asm
deleted file mode 100644
index c44df8fa50..0000000000
--- a/gmp/mpn/powerpc32/p3-p7/aors_n.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-dnl PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-
-dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 1.5
-C POWER4/PPC970 2
-C POWER5 2
-C POWER6 2.78
-C POWER7 2.15-2.87
-
-C This code is based on powerpc64/aors_n.asm.
-
-C INPUT PARAMETERS
-C rp r3
-C up r4
-C vp r5
-C n r6
-
-ifdef(`OPERATION_add_n',`
- define(ADDSUBC, adde)
- define(ADDSUB, addc)
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)
- define(GENRVAL, `addi r3, r3, 1')
- define(SETCBR, `addic r0, $1, -1')
- define(CLRCB, `addic r0, r0, 0')
-')
-ifdef(`OPERATION_sub_n',`
- define(ADDSUBC, subfe)
- define(ADDSUB, subfc)
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)
- define(GENRVAL, `neg r3, r3')
- define(SETCBR, `subfic r0, $1, 0')
- define(CLRCB, `addic r0, r1, -1')
-')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-ASM_START()
-PROLOGUE(func_nc)
- SETCBR(r7)
- b L(ent)
-EPILOGUE()
-
-PROLOGUE(func)
- CLRCB
-L(ent): stw r31, -4(r1)
- stw r30, -8(r1)
- stw r29, -12(r1)
- stw r28, -16(r1)
-
- rlwinm. r0, r6, 0,30,31 C r0 = n & 3, set cr0
- cmpwi cr6, r0, 2
- addi r6, r6, 3 C compute count...
- srwi r6, r6, 2 C ...for ctr
- mtctr r6 C copy count into ctr
- beq cr0, L(b00)
- blt cr6, L(b01)
- beq cr6, L(b10)
-
-L(b11): lwz r8, 0(r4) C load s1 limb
- lwz r9, 0(r5) C load s2 limb
- lwz r10, 4(r4) C load s1 limb
- lwz r11, 4(r5) C load s2 limb
- lwz r12, 8(r4) C load s1 limb
- addi r4, r4, 12
- lwz r0, 8(r5) C load s2 limb
- addi r5, r5, 12
- ADDSUBC r29, r9, r8
- ADDSUBC r30, r11, r10
- ADDSUBC r31, r0, r12
- stw r29, 0(r3)
- stw r30, 4(r3)
- stw r31, 8(r3)
- addi r3, r3, 12
- bdnz L(go)
- b L(ret)
-
-L(b01): lwz r12, 0(r4) C load s1 limb
- addi r4, r4, 4
- lwz r0, 0(r5) C load s2 limb
- addi r5, r5, 4
- ADDSUBC r31, r0, r12 C add
- stw r31, 0(r3)
- addi r3, r3, 4
- bdnz L(go)
- b L(ret)
-
-L(b10): lwz r10, 0(r4) C load s1 limb
- lwz r11, 0(r5) C load s2 limb
- lwz r12, 4(r4) C load s1 limb
- addi r4, r4, 8
- lwz r0, 4(r5) C load s2 limb
- addi r5, r5, 8
- ADDSUBC r30, r11, r10 C add
- ADDSUBC r31, r0, r12 C add
- stw r30, 0(r3)
- stw r31, 4(r3)
- addi r3, r3, 8
- bdnz L(go)
- b L(ret)
-
-L(b00): C INITCY C clear/set cy
-L(go): lwz r6, 0(r4) C load s1 limb
- lwz r7, 0(r5) C load s2 limb
- lwz r8, 4(r4) C load s1 limb
- lwz r9, 4(r5) C load s2 limb
- lwz r10, 8(r4) C load s1 limb
- lwz r11, 8(r5) C load s2 limb
- lwz r12, 12(r4) C load s1 limb
- lwz r0, 12(r5) C load s2 limb
- bdz L(end)
-
- addi r4, r4, 16
- addi r5, r5, 16
-
- ALIGN(16)
-L(top): ADDSUBC r28, r7, r6
- lwz r6, 0(r4) C load s1 limb
- lwz r7, 0(r5) C load s2 limb
- ADDSUBC r29, r9, r8
- lwz r8, 4(r4) C load s1 limb
- lwz r9, 4(r5) C load s2 limb
- ADDSUBC r30, r11, r10
- lwz r10, 8(r4) C load s1 limb
- lwz r11, 8(r5) C load s2 limb
- ADDSUBC r31, r0, r12
- lwz r12, 12(r4) C load s1 limb
- lwz r0, 12(r5) C load s2 limb
- stw r28, 0(r3)
- addi r4, r4, 16
- stw r29, 4(r3)
- addi r5, r5, 16
- stw r30, 8(r3)
- stw r31, 12(r3)
- addi r3, r3, 16
- bdnz L(top) C decrement ctr and loop back
-
-L(end): ADDSUBC r28, r7, r6
- ADDSUBC r29, r9, r8
- ADDSUBC r30, r11, r10
- ADDSUBC r31, r0, r12
- stw r28, 0(r3)
- stw r29, 4(r3)
- stw r30, 8(r3)
- stw r31, 12(r3)
-
-L(ret): lwz r31, -4(r1)
- lwz r30, -8(r1)
- lwz r29, -12(r1)
- lwz r28, -16(r1)
-
- subfe r3, r0, r0 C -cy
- GENRVAL
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc32/p3/gmp-mparam.h b/gmp/mpn/powerpc32/p3/gmp-mparam.h
deleted file mode 100644
index 33826956a2..0000000000
--- a/gmp/mpn/powerpc32/p3/gmp-mparam.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 450 MHz POWER3 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
-#define USE_PREINV_DIVREM_1 1
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 10
-#define MUL_TOOM33_THRESHOLD 38
-#define MUL_TOOM44_THRESHOLD 58
-#define MUL_TOOM6H_THRESHOLD 129
-#define MUL_TOOM8H_THRESHOLD 212
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 63
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 59
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 14
-#define SQR_TOOM3_THRESHOLD 53
-#define SQR_TOOM4_THRESHOLD 76
-#define SQR_TOOM6_THRESHOLD 106
-#define SQR_TOOM8_THRESHOLD 284
-
-#define MULMOD_BNM1_THRESHOLD 9
-#define SQRMOD_BNM1_THRESHOLD 9
-
-#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 220, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 9, 5}, { 19, 6}, { 13, 7}, { 7, 6}, \
- { 16, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 23, 9}, { 7, 8}, { 15, 7}, \
- { 33, 8}, { 23, 9}, { 15, 8}, { 35, 9}, \
- { 23,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \
- { 63, 8}, { 127, 9}, { 71, 8}, { 143, 9}, \
- { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255, 9}, { 143,10}, { 79, 9}, \
- { 159, 8}, { 319, 9}, { 175, 8}, { 351,10}, \
- { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
- { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \
- { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 351, 9}, { 703, 8}, \
- { 1407,11}, { 191,10}, { 415,11}, { 223,10}, \
- { 447, 9}, { 895,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 82
-#define MUL_FFT_THRESHOLD 2688
-
-#define SQR_FFT_MODF_THRESHOLD 176 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 176, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \
- { 19, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \
- { 7, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \
- { 7, 8}, { 15, 7}, { 31, 8}, { 23, 9}, \
- { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 95,10}, { 31, 9}, { 63, 8}, \
- { 127, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \
- { 575, 9}, { 79, 8}, { 159,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 143, 8}, { 287, 7}, { 575,10}, \
- { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \
- { 95, 9}, { 191, 8}, { 383,10}, { 111, 9}, \
- { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 223,12}, { 63,11}, { 127,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \
- { 703, 8}, { 1407,11}, { 191,10}, { 383,11}, \
- { 223,10}, { 447, 9}, { 895,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 87
-#define SQR_FFT_THRESHOLD 1728
-
-#define MULLO_BASECASE_THRESHOLD 2
-#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 5240
-
-#define DC_DIV_QR_THRESHOLD 32
-#define DC_DIVAPPR_Q_THRESHOLD 123
-#define DC_BDIV_QR_THRESHOLD 34
-#define DC_BDIV_Q_THRESHOLD 84
-
-#define INV_MULMOD_BNM1_THRESHOLD 42
-#define INV_NEWTON_THRESHOLD 129
-#define INV_APPR_THRESHOLD 124
-
-#define BINV_NEWTON_THRESHOLD 148
-#define REDC_1_TO_REDC_N_THRESHOLD 38
-
-#define MU_DIV_QR_THRESHOLD 748
-#define MU_DIVAPPR_Q_THRESHOLD 748
-#define MUPI_DIV_QR_THRESHOLD 59
-#define MU_BDIV_QR_THRESHOLD 562
-#define MU_BDIV_Q_THRESHOLD 654
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 76
-#define GCD_DC_THRESHOLD 205
-#define GCDEXT_DC_THRESHOLD 174
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 181
-#define SET_STR_PRECOMPUTE_THRESHOLD 525
diff --git a/gmp/mpn/powerpc32/p4/gmp-mparam.h b/gmp/mpn/powerpc32/p4/gmp-mparam.h
deleted file mode 100644
index 20830a0bd7..0000000000
--- a/gmp/mpn/powerpc32/p4/gmp-mparam.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2004, 2008-2011, 2014 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/* 1800 MHz PowerPC-970 */
-/* FFT tuning limit = 10000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.0 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 1
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 42
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 45
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 73
-#define MUL_TOOM44_THRESHOLD 130
-#define MUL_TOOM6H_THRESHOLD 222
-#define MUL_TOOM8H_THRESHOLD 333
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 107
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 92
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100
-
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 85
-#define SQR_TOOM4_THRESHOLD 160
-#define SQR_TOOM6_THRESHOLD 197
-#define SQR_TOOM8_THRESHOLD 357
-
-#define MULMID_TOOM42_THRESHOLD 32
-
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 16
-
-#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 444, 5}, { 17, 6}, { 9, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 24, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 95,10}, { 31, 9}, { 63, 8}, \
- { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 167,10}, { 95, 9}, { 191, 8}, { 383,10}, \
- { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 143, 9}, { 287, 8}, { 575, 9}, \
- { 303,10}, { 159, 9}, { 319,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
- { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 335, 9}, { 671, 8}, { 1343,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 447,12}, { 127,11}, { 255,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \
- { 319,10}, { 671, 9}, { 1343,11}, { 351,10}, \
- { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,10}, { 831,11}, { 447,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
- { 1343,11}, { 703,10}, { 1407,11}, { 735,12}, \
- { 383,11}, { 767,10}, { 1535,11}, { 831,12}, \
- { 447,10}, { 1791,11}, { 959,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
- { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \
- { 831,11}, { 1727,10}, { 3455,11}, { 1791,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \
- { 1727,11}, { 3455,12}, { 1791,14}, { 511,13}, \
- { 1151,12}, { 2431,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 157
-#define MUL_FFT_THRESHOLD 6784
-
-#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 28, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \
- { 319,10}, { 95, 9}, { 191, 8}, { 383, 9}, \
- { 207,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
- { 575, 9}, { 303, 8}, { 607,10}, { 159, 9}, \
- { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
- { 1087,10}, { 287, 9}, { 575,10}, { 303, 9}, \
- { 607,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \
- { 831,11}, { 223,10}, { 447,12}, { 127,11}, \
- { 255,10}, { 543, 9}, { 1087,11}, { 287,10}, \
- { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,10}, { 831,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
- { 1343,11}, { 703,10}, { 1407,11}, { 735,12}, \
- { 383,11}, { 831,12}, { 447,11}, { 959,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
- { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
- { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
- { 2111,13}, { 1151,12}, { 2431,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 150
-#define SQR_FFT_THRESHOLD 4736
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 55
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 50
-#define DC_DIVAPPR_Q_THRESHOLD 196
-#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 166
-
-#define INV_MULMOD_BNM1_THRESHOLD 50
-#define INV_NEWTON_THRESHOLD 226
-#define INV_APPR_THRESHOLD 202
-
-#define BINV_NEWTON_THRESHOLD 228
-#define REDC_1_TO_REDC_N_THRESHOLD 67
-
-#define MU_DIV_QR_THRESHOLD 1187
-#define MU_DIVAPPR_Q_THRESHOLD 1308
-#define MUPI_DIV_QR_THRESHOLD 114
-#define MU_BDIV_QR_THRESHOLD 998
-#define MU_BDIV_Q_THRESHOLD 1142
-
-#define POWM_SEC_TABLE 3,28,78,480,1099
-
-#define MATRIX22_STRASSEN_THRESHOLD 9
-#define HGCD_THRESHOLD 93
-#define HGCD_APPR_THRESHOLD 109
-#define HGCD_REDUCE_THRESHOLD 2479
-#define GCD_DC_THRESHOLD 379
-#define GCDEXT_DC_THRESHOLD 273
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 11
-#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 381
-#define SET_STR_PRECOMPUTE_THRESHOLD 1002
-
-#define FAC_DSC_THRESHOLD 179
-#define FAC_ODD_THRESHOLD 28
diff --git a/gmp/mpn/powerpc32/p5/gmp-mparam.h b/gmp/mpn/powerpc32/p5/gmp-mparam.h
deleted file mode 100644
index faa1e81da4..0000000000
--- a/gmp/mpn/powerpc32/p5/gmp-mparam.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 1650 MHz POWER5 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 1
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 61
-
-#define MUL_TOOM22_THRESHOLD 22
-#define MUL_TOOM33_THRESHOLD 57
-#define MUL_TOOM44_THRESHOLD 130
-#define MUL_TOOM6H_THRESHOLD 189
-#define MUL_TOOM8H_THRESHOLD 309
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
-
-#define SQR_BASECASE_THRESHOLD 6
-#define SQR_TOOM2_THRESHOLD 40
-#define SQR_TOOM3_THRESHOLD 77
-#define SQR_TOOM4_THRESHOLD 124
-#define SQR_TOOM6_THRESHOLD 140
-#define SQR_TOOM8_THRESHOLD 238
-
-#define MULMID_TOOM42_THRESHOLD 40
-
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 16
-
-#define POWM_SEC_TABLE 4,29,252,840,2080
-
-#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 412, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
- { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
- { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
- { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
- { 159,10}, { 335, 9}, { 671,10}, { 351, 9}, \
- { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \
- { 415, 9}, { 831,11}, { 223,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 71
-#define MUL_FFT_THRESHOLD 4736
-
-#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 47,10}, { 31, 9}, \
- { 71,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
- { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
- { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
- { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 415,11}, { 223,10}, { 447,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 76
-#define SQR_FFT_THRESHOLD 3712
-
-#define MULLO_BASECASE_THRESHOLD 2
-#define MULLO_DC_THRESHOLD 68
-#define MULLO_MUL_N_THRESHOLD 9236
-
-#define DC_DIV_QR_THRESHOLD 69
-#define DC_DIVAPPR_Q_THRESHOLD 220
-#define DC_BDIV_QR_THRESHOLD 75
-#define DC_BDIV_Q_THRESHOLD 188
-
-#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 230
-#define INV_APPR_THRESHOLD 230
-
-#define BINV_NEWTON_THRESHOLD 278
-#define REDC_1_TO_REDC_N_THRESHOLD 87
-
-#define MU_DIV_QR_THRESHOLD 1210
-#define MU_DIVAPPR_Q_THRESHOLD 1308
-#define MUPI_DIV_QR_THRESHOLD 106
-#define MU_BDIV_QR_THRESHOLD 1017
-#define MU_BDIV_Q_THRESHOLD 1210
-
-#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 110
-#define HGCD_APPR_THRESHOLD 138
-#define HGCD_REDUCE_THRESHOLD 2578
-#define GCD_DC_THRESHOLD 408
-#define GCDEXT_DC_THRESHOLD 298
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 527
-#define SET_STR_PRECOMPUTE_THRESHOLD 1090
diff --git a/gmp/mpn/powerpc32/p6/gmp-mparam.h b/gmp/mpn/powerpc32/p6/gmp-mparam.h
deleted file mode 100644
index c9504b63b3..0000000000
--- a/gmp/mpn/powerpc32/p6/gmp-mparam.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 3500 MHz POWER6 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 19
-#define MUL_TOOM33_THRESHOLD 55
-#define MUL_TOOM44_THRESHOLD 88
-#define MUL_TOOM6H_THRESHOLD 137
-#define MUL_TOOM8H_THRESHOLD 181
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 56
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 56
-#define SQR_TOOM4_THRESHOLD 130
-#define SQR_TOOM6_THRESHOLD 189
-#define SQR_TOOM8_THRESHOLD 296
-
-#define MULMID_TOOM42_THRESHOLD 26
-
-#define MULMOD_BNM1_THRESHOLD 7
-#define SQRMOD_BNM1_THRESHOLD 12
-
-#define POWM_SEC_TABLE 2,26,127,453,1068
-
-#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 212, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \
- { 19, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
- { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \
- { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \
- { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \
- { 127, 9}, { 71, 8}, { 143, 7}, { 287, 9}, \
- { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255, 7}, { 511, 9}, { 143, 8}, \
- { 287,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
- { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \
- { 383, 9}, { 207,10}, { 111,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
- { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \
- { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \
- { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 447,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 89
-#define MUL_FFT_THRESHOLD 1728
-
-#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
- { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 23, 9}, { 7, 8}, { 23, 9}, \
- { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 63, 8}, { 127, 7}, \
- { 255, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \
- { 575, 9}, { 79,10}, { 47,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255, 9}, { 143, 8}, \
- { 287, 7}, { 575,10}, { 79, 9}, { 159, 8}, \
- { 319, 9}, { 175, 8}, { 351,10}, { 95, 9}, \
- { 191, 8}, { 383, 9}, { 207,10}, { 111, 9}, \
- { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \
- { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \
- { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \
- { 415,11}, { 223,10}, { 447, 9}, { 895,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 92
-#define SQR_FFT_THRESHOLD 1600
-
-#define MULLO_BASECASE_THRESHOLD 2
-#define MULLO_DC_THRESHOLD 57
-#define MULLO_MUL_N_THRESHOLD 3176
-
-#define DC_DIV_QR_THRESHOLD 52
-#define DC_DIVAPPR_Q_THRESHOLD 187
-#define DC_BDIV_QR_THRESHOLD 64
-#define DC_BDIV_Q_THRESHOLD 146
-
-#define INV_MULMOD_BNM1_THRESHOLD 68
-#define INV_NEWTON_THRESHOLD 182
-#define INV_APPR_THRESHOLD 182
-
-#define BINV_NEWTON_THRESHOLD 186
-#define REDC_1_TO_REDC_N_THRESHOLD 60
-
-#define MU_DIV_QR_THRESHOLD 924
-#define MU_DIVAPPR_Q_THRESHOLD 807
-#define MUPI_DIV_QR_THRESHOLD 73
-#define MU_BDIV_QR_THRESHOLD 667
-#define MU_BDIV_Q_THRESHOLD 823
-
-#define MATRIX22_STRASSEN_THRESHOLD 8
-#define HGCD_THRESHOLD 61
-#define HGCD_APPR_THRESHOLD 50
-#define HGCD_REDUCE_THRESHOLD 974
-#define GCD_DC_THRESHOLD 195
-#define GCDEXT_DC_THRESHOLD 134
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 9
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 190
-#define SET_STR_PRECOMPUTE_THRESHOLD 411
diff --git a/gmp/mpn/powerpc32/p7/gmp-mparam.h b/gmp/mpn/powerpc32/p7/gmp-mparam.h
deleted file mode 100644
index 35bb61dca2..0000000000
--- a/gmp/mpn/powerpc32/p7/gmp-mparam.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 3550 MHz POWER7/T4 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 1
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 34
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 89
-#define MUL_TOOM44_THRESHOLD 130
-#define MUL_TOOM6H_THRESHOLD 286
-#define MUL_TOOM8H_THRESHOLD 363
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
-
-#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 50
-#define SQR_TOOM3_THRESHOLD 89
-#define SQR_TOOM4_THRESHOLD 154
-#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 381
-
-#define MULMID_TOOM42_THRESHOLD 40
-
-#define MULMOD_BNM1_THRESHOLD 18
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define POWM_SEC_TABLE 4,35,225,780,2212
-
-#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 14, 5}, { 29, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
- { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
- { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
- { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
- { 1087,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 335, 9}, { 671, 8}, { 1343,10}, { 351,11}, \
- { 191,10}, { 415, 9}, { 831,10}, { 431,11}, \
- { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 77
-#define MUL_FFT_THRESHOLD 5312
-
-#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
- { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
- { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543, 8}, { 1087,10}, { 287, 9}, { 575,10}, \
- { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \
- { 831,11}, { 223,10}, { 447,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 79
-#define SQR_FFT_THRESHOLD 3712
-
-#define MULLO_BASECASE_THRESHOLD 2
-#define MULLO_DC_THRESHOLD 34
-#define MULLO_MUL_N_THRESHOLD 10323
-
-#define DC_DIV_QR_THRESHOLD 52
-#define DC_DIVAPPR_Q_THRESHOLD 202
-#define DC_BDIV_QR_THRESHOLD 68
-#define DC_BDIV_Q_THRESHOLD 152
-
-#define INV_MULMOD_BNM1_THRESHOLD 66
-#define INV_NEWTON_THRESHOLD 226
-#define INV_APPR_THRESHOLD 189
-
-#define BINV_NEWTON_THRESHOLD 292
-#define REDC_1_TO_REDC_N_THRESHOLD 79
-
-#define MU_DIV_QR_THRESHOLD 1442
-#define MU_DIVAPPR_Q_THRESHOLD 1442
-#define MUPI_DIV_QR_THRESHOLD 91
-#define MU_BDIV_QR_THRESHOLD 1308
-#define MU_BDIV_Q_THRESHOLD 1442
-
-#define MATRIX22_STRASSEN_THRESHOLD 16
-#define HGCD_THRESHOLD 126
-#define HGCD_APPR_THRESHOLD 139
-#define HGCD_REDUCE_THRESHOLD 2681
-#define GCD_DC_THRESHOLD 573
-#define GCDEXT_DC_THRESHOLD 448
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 9
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 834
-#define SET_STR_PRECOMPUTE_THRESHOLD 1888
diff --git a/gmp/mpn/powerpc32/powerpc-defs.m4 b/gmp/mpn/powerpc32/powerpc-defs.m4
index 0c142a2e0c..33cf97e387 100644
--- a/gmp/mpn/powerpc32/powerpc-defs.m4
+++ b/gmp/mpn/powerpc32/powerpc-defs.m4
@@ -3,32 +3,21 @@ divert(-1)
dnl m4 macros for PowerPC assembler (32 and 64 bit).
dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
diff --git a/gmp/mpn/powerpc32/rshift.asm b/gmp/mpn/powerpc32/rshift.asm
index cb0046d5ee..b069a93d12 100644
--- a/gmp/mpn/powerpc32/rshift.asm
+++ b/gmp/mpn/powerpc32/rshift.asm
@@ -1,32 +1,22 @@
dnl PowerPC-32 mpn_rshift -- Shift a number right.
-dnl Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc.
+dnl Copyright 1995, 1998, 2000, 2002, 2003, 2004, 2005 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -48,7 +38,7 @@ C cnt r6
ASM_START()
PROLOGUE(mpn_rshift)
- cmpwi cr0, r5, 30 C more than 30 limbs?
+ cmpwi cr0, r5, 12 C more than 12 limbs?
addi r7, r3, -4 C dst-4
bgt L(BIG) C branch if more than 12 limbs
@@ -161,4 +151,4 @@ L(loopU):
stw r12, 20(r7)
lmw r24, -32(r1) C restore registers
blr
-EPILOGUE()
+EPILOGUE(mpn_rshift)
diff --git a/gmp/mpn/powerpc32/sec_tabselect.asm b/gmp/mpn/powerpc32/sec_tabselect.asm
deleted file mode 100644
index a3f24d5678..0000000000
--- a/gmp/mpn/powerpc32/sec_tabselect.asm
+++ /dev/null
@@ -1,141 +0,0 @@
-dnl PowerPC-32 mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C 603e: ?
-C 604e: ?
-C 75x (G3): ?
-C 7400,7410 (G4): 2.5
-C 744x,745x (G4+): 2.0
-C power4/ppc970: 2.0
-C power5: ?
-
-define(`rp', `r3')
-define(`tp', `r4')
-define(`n', `r5')
-define(`nents', `r6')
-define(`which', `r7')
-
-define(`i', `r8')
-define(`j', `r9')
-define(`stride', `r12')
-define(`mask', `r11')
-
-
-ASM_START()
-PROLOGUE(mpn_sec_tabselect)
- addic. j, n, -4 C outer loop induction variable
- stmw r27, -32(r1)
- slwi stride, n, 2
-
- blt cr0, L(outer_end)
-L(outer_top):
- mtctr nents
- mr r10, tp
- li r28, 0
- li r29, 0
- li r30, 0
- li r31, 0
- addic. j, j, -4 C outer loop induction variable
- mr i, which
-
- ALIGN(16)
-L(top): addic i, i, -1 C set carry iff i != 0
- subfe mask, mask, mask
- lwz r0, 0(tp)
- lwz r27, 4(tp)
- and r0, r0, mask
- and r27, r27, mask
- or r28, r28, r0
- or r29, r29, r27
- lwz r0, 8(tp)
- lwz r27, 12(tp)
- and r0, r0, mask
- and r27, r27, mask
- or r30, r30, r0
- or r31, r31, r27
- add tp, tp, stride
- bdnz L(top)
-
- stw r28, 0(rp)
- stw r29, 4(rp)
- stw r30, 8(rp)
- stw r31, 12(rp)
- addi tp, r10, 16
- addi rp, rp, 16
- bge cr0, L(outer_top)
-L(outer_end):
-
- andi. r0, n, 2
- beq cr0, L(b0x)
-L(b1x): mtctr nents
- mr r10, tp
- li r28, 0
- li r29, 0
- mr i, which
- ALIGN(16)
-L(tp2): addic i, i, -1
- subfe mask, mask, mask
- lwz r0, 0(tp)
- lwz r27, 4(tp)
- and r0, r0, mask
- and r27, r27, mask
- or r28, r28, r0
- or r29, r29, r27
- add tp, tp, stride
- bdnz L(tp2)
- stw r28, 0(rp)
- stw r29, 4(rp)
- addi tp, r10, 8
- addi rp, rp, 8
-
-L(b0x): andi. r0, n, 1
- beq cr0, L(b00)
-L(b01): mtctr nents
- mr r10, tp
- li r28, 0
- mr i, which
- ALIGN(16)
-L(tp1): addic i, i, -1
- subfe mask, mask, mask
- lwz r0, 0(tp)
- and r0, r0, mask
- or r28, r28, r0
- add tp, tp, stride
- bdnz L(tp1)
- stw r28, 0(rp)
-
-L(b00): lmw r27, -32(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc32/sqr_diag_addlsh1.asm b/gmp/mpn/powerpc32/sqr_diag_addlsh1.asm
deleted file mode 100644
index f7aba33ee5..0000000000
--- a/gmp/mpn/powerpc32/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,80 +0,0 @@
-dnl PowerPC-32 mpn_sqr_diag_addlsh1.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C 603e ?
-C 604e ?
-C 75x (G3) ?
-C 7400,7410 (G4) ?
-C 744x,745x (G4+) 6
-C power4/ppc970 ?
-C power5 ?
-
-C This has been feebly optimised for 7447 but not for any other CPU.
-
-define(`rp', r3)
-define(`tp', r4)
-define(`up', r5)
-define(`n', r6)
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
- addi n, n, -1
- addi tp, tp, -4
- mtctr n
- lwz r0, 0(up)
- li r10, 0
- mullw r7, r0, r0
- stw r7, 0(rp)
- mulhwu r6, r0, r0
- addic r31, r31, 0 C clear CF
-
- ALIGN(16)
-L(top): lwzu r0, 4(up)
- mullw r7, r0, r0
- lwz r8, 4(tp)
- lwzu r9, 8(tp)
- rlwimi r10, r8, 1,0,30
- srwi r11, r8, 31
- rlwimi r11, r9, 1,0,30
- adde r10, r10, r6
- adde r11, r11, r7
- stw r10, 4(rp)
- srwi r10, r9, 31
- mulhwu r6, r0, r0
- stwu r11, 8(rp)
- bdnz L(top)
-
- adde r10, r10, r6
- stw r10, 4(rp)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc32/sqr_diagonal.asm b/gmp/mpn/powerpc32/sqr_diagonal.asm
new file mode 100644
index 0000000000..d315349f63
--- /dev/null
+++ b/gmp/mpn/powerpc32/sqr_diagonal.asm
@@ -0,0 +1,103 @@
+dnl PowerPC-32 mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002, 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 603e: ?
+C 604e: 4.0
+C 75x (G3): 10.5
+C 7400,7410 (G4): 10.5
+C 744x,745x (G4+): 4.0
+C power4/ppc970: 8.6
+C power5: 7.0
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ lwz r6,0(r4) C r6 = first source limb, up[0]
+ mtctr r5 C CTR = n, counts the limbs still to be fetched
+
+ addi r3,r3,-4 C bias rp by one word so stores below use offsets 4 and 8
+ bdz L(end1) C n = 1: single limb. NOTE(review): assumes n >= 1 -- confirm
+
+ lwzu r7,4(r4) C r7 = up[1], r4 advanced by 4
+ mullw r9,r6,r6 C r9 = low word of up[0] squared
+ mulhwu r11,r6,r6 C r11 = high word of up[0] squared
+ bdz L(end2) C n = 2: one square pending in r9/r11, r7 still to square
+
+ lwzu r6,4(r4) C r6 = up[2], r4 advanced by 4
+ mullw r8,r7,r7 C r8 = low word of up[1] squared
+ mulhwu r10,r7,r7 C r10 = high word of up[1] squared
+ bdz L(ende) C n = 3: two squares pending, r6 still to square
+
+L(loop):
+ lwzu r7,4(r4) C fetch next limb into r7
+ stw r9,4(r3) C store low word of the older pending square
+ mullw r9,r6,r6 C start square of r6, low word reuses r9
+ stwu r11,8(r3) C store high word of the older square, r3 advanced by 8
+ mulhwu r11,r6,r6 C high word of r6 squared
+ bdz L(endo) C out of limbs: r8/r10 is now the older pending square
+ lwzu r6,4(r4) C fetch next limb into r6
+ stw r8,4(r3) C second half mirrors the first with r6/r7, r8/r9, r10/r11 swapped
+ mullw r8,r7,r7 C low word of r7 squared
+ stwu r10,8(r3) C store high word, r3 advanced by 8
+ mulhwu r10,r7,r7 C high word of r7 squared
+ bdnz L(loop) C continue while limbs remain, else fall into L(ende)
+
+L(ende):
+ stw r9,4(r3) C drain: older pending square is r9/r11
+ mullw r9,r6,r6 C square the last fetched limb r6, low word
+ stw r11,8(r3)
+ mulhwu r11,r6,r6 C high word of the last square
+ stw r8,12(r3) C newer pending square r8/r10
+ stw r10,16(r3)
+ stw r9,20(r3) C last square, low then high word
+ stw r11,24(r3)
+ blr
+L(endo):
+ stw r8,4(r3) C drain: older pending square is r8/r10
+ mullw r8,r7,r7 C square the last fetched limb r7, low word
+ stw r10,8(r3)
+ mulhwu r10,r7,r7 C high word of the last square
+ stw r9,12(r3) C newer pending square r9/r11
+ stw r11,16(r3)
+ stw r8,20(r3) C last square, low then high word
+ stw r10,24(r3)
+ blr
+
+L(end2):
+ mullw r8,r7,r7 C low word of up[1] squared
+ stw r9,4(r3) C low word of up[0] squared
+ mulhwu r10,r7,r7 C high word of up[1] squared
+ stw r11,8(r3) C high word of up[0] squared
+ stw r8,12(r3)
+ stw r10,16(r3)
+ blr
+L(end1):
+ mullw r9,r6,r6 C low word of up[0] squared
+ mulhwu r11,r6,r6 C high word of up[0] squared
+ stw r9,4(r3) C with rp biased by -4 these two stores fill the first two result words
+ stw r11,8(r3)
+ blr
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/gmp/mpn/powerpc32/sublsh1_n.asm b/gmp/mpn/powerpc32/sublsh1_n.asm
index 6dc6460016..c8711d09a6 100644
--- a/gmp/mpn/powerpc32/sublsh1_n.asm
+++ b/gmp/mpn/powerpc32/sublsh1_n.asm
@@ -3,30 +3,19 @@ dnl PowerPC-32 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/submul_1.asm b/gmp/mpn/powerpc32/submul_1.asm
index 9fcdaa291b..ae40bb4473 100644
--- a/gmp/mpn/powerpc32/submul_1.asm
+++ b/gmp/mpn/powerpc32/submul_1.asm
@@ -5,30 +5,19 @@ dnl Copyright 1995, 1997, 1998, 2000, 2002, 2005 Free Software Foundation,
dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/umul.asm b/gmp/mpn/powerpc32/umul.asm
index a5811e1651..400f009337 100644
--- a/gmp/mpn/powerpc32/umul.asm
+++ b/gmp/mpn/powerpc32/umul.asm
@@ -1,32 +1,21 @@
-dnl PowerPC-32 umul_ppmm -- support for longlong.h
+dnl PowerPC-32 umul_ppmm -- support for longlong.h
-dnl Copyright 2000, 2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2000, 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+dnl General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/vmx/copyd.asm b/gmp/mpn/powerpc32/vmx/copyd.asm
index 6aac6b8389..e345eef01f 100644
--- a/gmp/mpn/powerpc32/vmx/copyd.asm
+++ b/gmp/mpn/powerpc32/vmx/copyd.asm
@@ -3,30 +3,19 @@ dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -48,7 +37,7 @@ C read-modify-write tricks.
C * The VMX code is used from the smallest sizes it handles, but measurements
C show a large speed bump at the cutoff points. Small copying (perhaps
C using some read-modify-write technique) should be optimized.
-C * Make a mpn_com based on this code.
+C * Make a mpn_com_n based on this code.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
diff --git a/gmp/mpn/powerpc32/vmx/copyi.asm b/gmp/mpn/powerpc32/vmx/copyi.asm
index a97a0fa6dc..b6b2e7ea8d 100644
--- a/gmp/mpn/powerpc32/vmx/copyi.asm
+++ b/gmp/mpn/powerpc32/vmx/copyi.asm
@@ -3,30 +3,19 @@ dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.
dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -48,7 +37,7 @@ C read-modify-write tricks.
C * The VMX code is used from the smallest sizes it handles, but measurements
C show a large speed bump at the cutoff points. Small copying (perhaps
C using some read-modify-write technique) should be optimized.
-C * Make a mpn_com based on this code.
+C * Make a mpn_com_n based on this code.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
diff --git a/gmp/mpn/powerpc32/vmx/logops_n.asm b/gmp/mpn/powerpc32/vmx/logops_n.asm
index d656d3b73f..7ed731e483 100644
--- a/gmp/mpn/powerpc32/vmx/logops_n.asm
+++ b/gmp/mpn/powerpc32/vmx/logops_n.asm
@@ -5,30 +5,19 @@ dnl logical operations.
dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc32/vmx/mod_34lsub1.asm b/gmp/mpn/powerpc32/vmx/mod_34lsub1.asm
index 9b7e4f1a50..8aee6f81de 100644
--- a/gmp/mpn/powerpc32/vmx/mod_34lsub1.asm
+++ b/gmp/mpn/powerpc32/vmx/mod_34lsub1.asm
@@ -1,32 +1,21 @@
dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
-dnl Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc.
+dnl Copyright 2002, 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -146,15 +135,15 @@ L(large):
andi. r7, up, 15
vxor a0, v0, v0
- lis r9, 0xaaaa
+ lis r0, 0xaaaa
vxor a1, v0, v0
- ori r9, r9, 0xaaab
+ ori r0, r0, 0xaaab
vxor a2, v0, v0
li r5, 16
vxor c0, v0, v0
li r6, 32
vxor c1, v0, v0
- LEAL( r11, cnsts) C CAUTION clobbers r0 for elf, darwin
+ LEAL( r11, cnsts)
vxor c2, v0, v0
vxor z, v0, v0
@@ -169,7 +158,7 @@ L(large):
vsldoi a2, z, a2, 12
addi n, n, 9
- mulhwu r0, n, r9
+ mulhwu r0, n, r0
srwi r0, r0, 3 C r0 = floor(n/12)
mtctr r0
@@ -185,7 +174,7 @@ L(na4): bne cr7, L(na8)
vsldoi a1, z, a1, 8
addi n, n, 6
- mulhwu r0, n, r9
+ mulhwu r0, n, r0
srwi r0, r0, 3 C r0 = floor(n/12)
mtctr r0
@@ -199,7 +188,7 @@ L(na8):
vsldoi a0, z, a0, 4
addi n, n, 3
- mulhwu r0, n, r9
+ mulhwu r0, n, r0
srwi r0, r0, 3 C r0 = floor(n/12)
mtctr r0
@@ -208,7 +197,7 @@ L(na8):
b L(0)
L(aligned16):
- mulhwu r0, n, r9
+ mulhwu r0, n, r0
srwi r0, r0, 3 C r0 = floor(n/12)
mtctr r0
diff --git a/gmp/mpn/powerpc32/vmx/popcount.asm b/gmp/mpn/powerpc32/vmx/popcount.asm
index 943c92d127..62fcaaee4a 100644
--- a/gmp/mpn/powerpc32/vmx/popcount.asm
+++ b/gmp/mpn/powerpc32/vmx/popcount.asm
@@ -3,32 +3,26 @@ dnl PowerPC-32/VMX mpn_popcount.
dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
MULFUNC_PROLOGUE(mpn_popcount)
include_mpn(`powerpc64/vmx/popcount.asm')
+
+C cycles/limb
+C 7400,7410 (G4): 2.75
+C 744x,745x (G4+): 2.25
+C 970 (G5): 5.3
diff --git a/gmp/mpn/powerpc64/README b/gmp/mpn/powerpc64/README
index 50dd3995c3..757357b4d8 100644
--- a/gmp/mpn/powerpc64/README
+++ b/gmp/mpn/powerpc64/README
@@ -1,30 +1,19 @@
-Copyright 1999-2001, 2003-2005 Free Software Foundation, Inc.
+Copyright 1999, 2000, 2001, 2003, 2004, 2005 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -124,7 +113,7 @@ Memory: 2 ld/st. Stores go to the L2 cache, which can sustain just
one store per cycle.
L1 load latency: to gregs 3-4 cycles, to fregs 5-6 cycles.
Operations that modify the address register might be split
- to use also an integer issue slot.
+ to use also a an integer issue slot.
Simple integer: 2 operations every cycle, latency 2.
Integer multiply: 2 operations every 6th cycle, latency 7 cycles.
Integer divide: ?
@@ -150,7 +139,7 @@ Problem is to get 32-bit or 16-bit words to the fp registers. Only 64-bit fp
memops copies bits without fiddling with them. We might therefore need to
load to integer registers with zero extension, store as 64 bits into temp
space, and then load to fp regs. Alternatively, load directly to fp space
-and add well-chosen constants to get cancellation. (Other part after given by
+and add well-chosen constants to get cancelation. (Other part after given by
subsequent subtraction.)
Possible code mix for load-via-intregs variant:
diff --git a/gmp/mpn/powerpc64/aix.m4 b/gmp/mpn/powerpc64/aix.m4
index bf6517d69d..589686a868 100644
--- a/gmp/mpn/powerpc64/aix.m4
+++ b/gmp/mpn/powerpc64/aix.m4
@@ -1,53 +1,42 @@
divert(-1)
dnl m4 macros for AIX 64-bit assembly.
-dnl Copyright 2000-2002, 2005, 2006, 2010, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2000, 2001, 2002, 2005, 2006 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`ASM_START',
- `.machine "any"
+ `.machine "ppc64"
.toc')
-dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
dnl
dnl Don't want ELF style .size in the epilogue.
define(`PROLOGUE_cpu',
-m4_assert_numargs_range(1,2)
-`ifelse(`$2',toc,,
-`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+m4_assert_numargs(1)
+ `
.globl $1
.globl .$1
.csect [DS], 3
$1:
.llong .$1, TOC[tc0], 0
- .csect .$1[PR], 6
+ .csect [PR]
+ .align 4
.$1:')
define(`EPILOGUE_cpu',
@@ -92,6 +81,4 @@ define(`CALL',
define(`ASM_END', `TOC_ENTRY')
-undefine(`EXTRA_REGISTER')
-
divert
diff --git a/gmp/mpn/powerpc64/com.asm b/gmp/mpn/powerpc64/com.asm
deleted file mode 100644
index 074b7ff6e4..0000000000
--- a/gmp/mpn/powerpc64/com.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-dnl PowerPC-64 mpn_com.
-
-dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 1.25
-C POWER5 ?
-C POWER6 1.32
-C POWER7 1.13
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-
-ASM_START()
-PROLOGUE(mpn_com)
-
-ifdef(`HAVE_ABI_mode32',
-` rldicl n, n, 0,32')
-
- cmpdi cr0, n, 4
- blt L(sml)
-
- addi r10, n, 4
- srdi r10, r10, 3
- mtctr r10
-
- andi. r0, n, 1
- rlwinm r11, n, 0,30,30
- rlwinm r12, n, 0,29,29
- cmpdi cr6, r11, 0
- cmpdi cr7, r12, 0
-
- beq cr0, L(xx0)
-L(xx1): ld r6, 0(up)
- addi up, up, 8
- nor r6, r6, r6
- std r6, 0(rp)
- addi rp, rp, 8
-
-L(xx0): bne cr6, L(x10)
-L(x00): ld r6, 0(r4)
- ld r7, 8(r4)
- bne cr7, L(100)
-L(000): addi rp, rp, -32
- b L(lo0)
-L(100): addi up, up, -32
- b L(lo4)
-L(x10): ld r8, 0(r4)
- ld r9, 8(r4)
- bne cr7, L(110)
-L(010): addi up, up, 16
- addi rp, rp, -16
- b L(lo2)
-L(110): addi up, up, -16
- addi rp, rp, -48
- b L(lo6)
-
-L(sml): mtctr n
-L(t): ld r6, 0(up)
- addi up, up, 8
- nor r6, r6, r6
- std r6, 0(rp)
- addi rp, rp, 8
- bdnz L(t)
- blr
-
- ALIGN(32)
-L(top): nor r6, r6, r6
- nor r7, r7, r7
- std r6, 0(rp)
- std r7, 8(rp)
-L(lo2): ld r6, 0(up)
- ld r7, 8(up)
- nor r8, r8, r8
- nor r9, r9, r9
- std r8, 16(rp)
- std r9, 24(rp)
-L(lo0): ld r8, 16(up)
- ld r9, 24(up)
- nor r6, r6, r6
- nor r7, r7, r7
- std r6, 32(rp)
- std r7, 40(rp)
-L(lo6): ld r6, 32(up)
- ld r7, 40(up)
- nor r8, r8, r8
- nor r9, r9, r9
- std r8, 48(rp)
- std r9, 56(rp)
- addi rp, rp, 64
-L(lo4): ld r8, 48(up)
- ld r9, 56(up)
- addi up, up, 64
- bdnz L(top)
-
-L(end): nor r6, r6, r6
- nor r7, r7, r7
- std r6, 0(rp)
- std r7, 8(rp)
- nor r8, r8, r8
- nor r9, r9, r9
- std r8, 16(rp)
- std r9, 24(rp)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/com_n.asm b/gmp/mpn/powerpc64/com_n.asm
new file mode 100644
index 0000000000..0c43d06cfe
--- /dev/null
+++ b/gmp/mpn/powerpc64/com_n.asm
@@ -0,0 +1,74 @@
+dnl PowerPC-64 mpn_com_n.
+
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 1?
+C POWER4/PPC970: 1.6
+
+C TODO
+C * 8-way unrolling brings timing down to about 1.3 cycles/limb.
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+
+ASM_START()
+PROLOGUE(mpn_com_n)
+ rldic. r0, r5, 3, 59 C r0 = (r5 & 3) << 3; cr0 = (n == 4t)?
+ cmpldi cr6, r0, 16 C cr6 = (n cmp 4t + 2)?
+
+ addi r5, r5, 3 C compute...
+ifdef(`HAVE_ABI_mode32',
+` rldicl r5, r5, 62,34', C ...branch count
+` rldicl r5, r5, 62, 2') C ...branch count
+ mtctr r5
+
+ add r4, r4, r0 C offset up
+ add r3, r3, r0 C offset rp
+
+ beq cr0, L(L00)
+ blt cr6, L(L01)
+ beq cr6, L(L10)
+ b L(L11)
+
+L(L00): addi r4, r4, 32
+ addi r3, r3, 32
+
+ ALIGN(16)
+L(oop): ld r6, -32(r4)
+ nor r6, r6, r6
+ std r6, -32(r3)
+L(L11): ld r6, -24(r4)
+ nor r6, r6, r6
+ std r6, -24(r3)
+L(L10): ld r6, -16(r4)
+ nor r6, r6, r6
+ std r6, -16(r3)
+L(L01): ld r6, -8(r4)
+ nor r6, r6, r6
+ addi r4, r4, 32
+ std r6, -8(r3)
+ addi r3, r3, 32
+ bdnz L(oop)
+
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/copyd.asm b/gmp/mpn/powerpc64/copyd.asm
index c6ce9309f1..6a46a433c9 100644
--- a/gmp/mpn/powerpc64/copyd.asm
+++ b/gmp/mpn/powerpc64/copyd.asm
@@ -3,39 +3,25 @@ dnl PowerPC-64 mpn_copyd
dnl Copyright 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1
-C POWER4/PPC970 1
-C POWER5 ?
-C POWER6 ?
-C POWER7 1.4
+C cycles/limb
+C POWER3/PPC630: 1
+C POWER4/PPC970: 1
C INPUT PARAMETERS
C rp r3
diff --git a/gmp/mpn/powerpc64/copyi.asm b/gmp/mpn/powerpc64/copyi.asm
index 9a86cb21cc..5cb7e48565 100644
--- a/gmp/mpn/powerpc64/copyi.asm
+++ b/gmp/mpn/powerpc64/copyi.asm
@@ -3,39 +3,25 @@ dnl PowerPC-64 mpn_copyi.
dnl Copyright 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1
-C POWER4/PPC970 1
-C POWER5 ?
-C POWER6 ?
-C POWER7 1.4
+C cycles/limb
+C POWER3/PPC630: 1
+C POWER4/PPC970: 1
C INPUT PARAMETERS
C rp r3
diff --git a/gmp/mpn/powerpc64/darwin.m4 b/gmp/mpn/powerpc64/darwin.m4
index a3180e48fd..10055be13a 100644
--- a/gmp/mpn/powerpc64/darwin.m4
+++ b/gmp/mpn/powerpc64/darwin.m4
@@ -2,48 +2,35 @@ divert(-1)
dnl m4 macros for Mac OS 64-bit assembly.
dnl Copyright 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`ASM_START',`')
-dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
dnl
define(`DARWIN')
define(`PROLOGUE_cpu',
-m4_assert_numargs_range(1,2)
-`ifelse(`$2',toc,,
-`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
- .text
+m4_assert_numargs(1)
+` .text
.globl $1
- .align 5
+ .align 4
$1:')
define(`EPILOGUE_cpu',
@@ -114,6 +101,4 @@ define(`CALL',
define(`ASM_END', `dnl')
-define(`EXTRA_REGISTER', r2)
-
divert
diff --git a/gmp/mpn/powerpc64/elf.m4 b/gmp/mpn/powerpc64/elf.m4
index ddb5a8ed79..e6da11f90c 100644
--- a/gmp/mpn/powerpc64/elf.m4
+++ b/gmp/mpn/powerpc64/elf.m4
@@ -2,60 +2,31 @@ divert(-1)
dnl m4 macros for powerpc64 GNU/Linux assembly.
dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-define(`ASM_START',
-`ifdef(`ELFv2_ABI',
-`
- .abiversion 2
-')')
+define(`ASM_START',`')
-dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
dnl
define(`PROLOGUE_cpu',
-m4_assert_numargs_range(1,2)
-`ifelse(`$2',toc,,
-`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
-ifdef(`ELFv2_ABI',
-`
- .globl $1
- .type $1, @function
- .section ".text"
- .align 5
-$1:
-ifelse(`$2',toc,`
-0: addis 2, 12, (.TOC.-0b)@ha
- addi 2, 2, (.TOC.-0b)@l
- .localentry $1, .-$1
-',)
-',`
+m4_assert_numargs(1)
+ `
.globl $1
.globl .$1
.section ".opd","aw"
@@ -65,17 +36,12 @@ $1:
.size $1, 24
.type .$1, @function
.section ".text"
- .align 5
-.$1:
-')')
+ .align 4
+.$1:')
define(`EPILOGUE_cpu',
m4_assert_numargs(1)
-`ifdef(`ELFv2_ABI',`
- .size $1, .-$1
-',`
- .size .$1, .-.$1
-')')
+` .size .$1, .-.$1')
define(`TOC_ENTRY', `')
@@ -118,6 +84,4 @@ define(`CALL',
define(`ASM_END', `TOC_ENTRY')
-undefine(`EXTRA_REGISTER')
-
divert
diff --git a/gmp/mpn/powerpc64/gmp-mparam.h b/gmp/mpn/powerpc64/gmp-mparam.h
new file mode 100644
index 0000000000..e0ab478e3e
--- /dev/null
+++ b/gmp/mpn/powerpc64/gmp-mparam.h
@@ -0,0 +1,63 @@
+/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1995, 1999, 2000, 2001, 2002, 2004 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+
+
+/* ???MHz ppc630 */
+
+/* Generated by tuneup.c, 2004-02-10, gcc "2.9" */
+
+#define MUL_KARATSUBA_THRESHOLD 8
+#define MUL_TOOM3_THRESHOLD 41
+
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_KARATSUBA_THRESHOLD 14
+#define SQR_TOOM3_THRESHOLD 48
+
+#define DIV_SB_PREINV_THRESHOLD 0
+#define DIV_DC_THRESHOLD 28
+#define POWM_THRESHOLD 40
+
+#define HGCD_THRESHOLD 56
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 408
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 1
+#define DIVREM_2_THRESHOLD 0 /* always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 22
+#define SET_STR_THRESHOLD 1815
+
+#define MUL_FFT_TABLE { 272, 544, 1344, 2304, 5120, 20480, 49152, 0 }
+#define MUL_FFT_MODF_THRESHOLD 216
+#define MUL_FFT_THRESHOLD 1408
+
+#define SQR_FFT_TABLE { 272, 608, 1344, 2304, 7168, 20480, 49152, 0 }
+#define SQR_FFT_MODF_THRESHOLD 200
+#define SQR_FFT_THRESHOLD 1408
diff --git a/gmp/mpn/powerpc64/logops_n.asm b/gmp/mpn/powerpc64/logops_n.asm
index 2fa6985d7a..917b59f455 100644
--- a/gmp/mpn/powerpc64/logops_n.asm
+++ b/gmp/mpn/powerpc64/logops_n.asm
@@ -1,42 +1,28 @@
dnl PowerPC-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
-dnl Copyright 2003-2005 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.75
-C POWER4/PPC970 2.10
-C POWER5 ?
-C POWER6 ?
-C POWER7 1.75
+C cycles/limb
+C POWER3/PPC630: 1.75
+C POWER4/PPC970: 2.10
C n POWER3/PPC630 POWER4/PPC970
C 1 15.00 15.33
diff --git a/gmp/mpn/powerpc64/lshift.asm b/gmp/mpn/powerpc64/lshift.asm
index 880944a4ae..41e5ddd8e5 100644
--- a/gmp/mpn/powerpc64/lshift.asm
+++ b/gmp/mpn/powerpc64/lshift.asm
@@ -1,207 +1,116 @@
dnl PowerPC-64 mpn_lshift -- rp[] = up[] << cnt
-dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
-include(`../config.m4')
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
-C POWER7 2.15
+include(`../config.m4')
-C TODO
-C * Try to reduce the number of needed live registers
-C * Micro-optimise header code
-C * Keep in synch with rshift.asm and lshiftc.asm
+C cycles/limb
+C POWER3/PPC630: 1.5
+C POWER4/PPC970: 3.0
C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`cnt', `r6')
+define(`rp',`r3')
+define(`up',`r4')
+define(`n',`r5')
+define(`cnt',`r6')
+
+define(`tnc',`r5')
+define(`v0',`r0')
+define(`v1',`r7')
+define(`u0',`r8')
+define(`u1',`r9')
+define(`h0',`r10')
+define(`h1',`r11')
-define(`tnc',`r0')
-define(`u0',`r30')
-define(`u1',`r31')
-define(`retval',`r5')
ASM_START()
PROLOGUE(mpn_lshift)
- std r31, -8(r1)
- std r30, -16(r1)
- subfic tnc, cnt, 64
- sldi r7, n, 3 C byte count corresponding to n
- add up, up, r7 C up = up + n
- add rp, rp, r7 C rp = rp + n
- rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
- cmpdi cr6, r30, 2
- addi r31, n, 3 C compute count...
- ld r10, -8(up) C load 1st limb for b00...b11
- srd retval, r10, tnc
ifdef(`HAVE_ABI_mode32',
-` rldicl r31, r31, 62,34', C ...branch count
-` srdi r31, r31, 2') C ...for ctr
- mtctr r31 C copy count into ctr
- beq cr0, L(b00)
- blt cr6, L(b01)
- ld r11, -16(up) C load 2nd limb for b10 and b11
- beq cr6, L(b10)
-
- ALIGN(16)
-L(b11): sld r8, r10, cnt
- srd r9, r11, tnc
- ld u1, -24(up)
- addi up, up, -24
- sld r12, r11, cnt
- srd r7, u1, tnc
- addi rp, rp, 16
- bdnz L(gt3)
-
- or r11, r8, r9
- sld r8, u1, cnt
- b L(cj3)
-
- ALIGN(16)
-L(gt3): ld u0, -8(up)
- or r11, r8, r9
- sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -16(up)
- or r10, r12, r7
- b L(L11)
-
- ALIGN(32)
-L(b10): sld r12, r10, cnt
- addi rp, rp, 24
- srd r7, r11, tnc
- bdnz L(gt2)
-
- sld r8, r11, cnt
- or r10, r12, r7
- b L(cj2)
-
-L(gt2): ld u0, -24(up)
- sld r8, r11, cnt
- srd r9, u0, tnc
- ld u1, -32(up)
- or r10, r12, r7
- sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -40(up)
- or r11, r8, r9
- addi up, up, -16
- b L(L10)
-
- ALIGN(16)
-L(b00): ld u1, -16(up)
- sld r12, r10, cnt
- srd r7, u1, tnc
- ld u0, -24(up)
- sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -32(up)
- or r10, r12, r7
- sld r12, u0, cnt
- srd r7, u1, tnc
- addi rp, rp, 8
- bdz L(cj4)
-
-L(gt4): addi up, up, -32
- ld u0, -8(up)
- or r11, r8, r9
- b L(L00)
-
- ALIGN(16)
-L(b01): bdnz L(gt1)
- sld r8, r10, cnt
- std r8, -8(rp)
- b L(ret)
-
-L(gt1): ld u0, -16(up)
- sld r8, r10, cnt
- srd r9, u0, tnc
- ld u1, -24(up)
- sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -32(up)
- or r11, r8, r9
- sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -40(up)
- addi up, up, -40
- or r10, r12, r7
- bdz L(end)
-
- ALIGN(32)
-L(top): sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -8(up)
- std r11, -8(rp)
- or r11, r8, r9
-L(L00): sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -16(up)
- std r10, -16(rp)
- or r10, r12, r7
-L(L11): sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -24(up)
- std r11, -24(rp)
- or r11, r8, r9
-L(L10): sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -32(up)
- addi up, up, -32
- std r10, -32(rp)
- addi rp, rp, -32
- or r10, r12, r7
- bdnz L(top)
-
- ALIGN(32)
-L(end): sld r12, u0, cnt
- srd r7, u1, tnc
- std r11, -8(rp)
-L(cj4): or r11, r8, r9
- sld r8, u1, cnt
- std r10, -16(rp)
-L(cj3): or r10, r12, r7
- std r11, -24(rp)
-L(cj2): std r10, -32(rp)
- std r8, -40(rp)
-
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
+` rldicl r7, r5, 0, 32 C zero extend n
+ mtctr r7', C copy n to count register
+` mtctr n') C copy n to count register
+
+ifdef(`HAVE_ABI_mode32',
+` rldic r0, n, 3, 32', C byte count corresponding to n
+` rldicr r0, n, 3, 60') C byte count corresponding to n
+
+ add rp, rp, r0 C rp = rp + n
+ add up, up, r0 C up = up + n
+ addi rp, rp, 8 C rp now points 16 beyond end
+ addi up, up, -8 C up now points to last limb
+ subfic tnc, cnt, 64 C reverse shift count
+
+ ld u0, 0(up)
+ sld h0, u0, cnt
+ srd r12, u0, tnc C return value
+ bdz L(1) C jump for n = 1
+
+ ld u1, -8(up)
+ bdz L(2) C jump for n = 2
+
+ ldu u0, -16(up)
+ bdz L(end) C jump for n = 3
+
+L(oop): srd v1, u1, tnc
+ sld h1, u1, cnt
+ ld u1, -8(up)
+ or h0, v1, h0
+ stdu h0, -16(rp)
+
+ bdz L(exit)
+
+ srd v0, u0, tnc
+ sld h0, u0, cnt
+ ldu u0, -16(up)
+ or h1, v0, h1
+ std h1, -8(rp)
+
+ bdnz L(oop)
+
+L(end): srd v1, u1, tnc
+ sld h1, u1, cnt
+ or h0, v1, h0
+ stdu h0, -16(rp)
+ srd v0, u0, tnc
+ sld h0, u0, cnt
+ or h1, v0, h1
+ std h1, -8(rp)
+L(1): std h0, -16(rp)
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, r12, 32
+ mr r4, r12
+',` mr r3, r12
+')
+ blr
+
+L(exit): srd v0, u0, tnc
+ sld h0, u0, cnt
+ or h1, v0, h1
+ std h1, -8(rp)
+L(2): srd v1, u1, tnc
+ sld h1, u1, cnt
+ or h0, v1, h0
+ stdu h0, -16(rp)
+ std h1, -8(rp)
ifdef(`HAVE_ABI_mode32',
-` srdi r3, retval, 32
- mr r4, retval
-',` mr r3, retval')
+` srdi r3, r12, 32
+ mr r4, r12
+',` mr r3, r12
+')
blr
EPILOGUE()
diff --git a/gmp/mpn/powerpc64/lshiftc.asm b/gmp/mpn/powerpc64/lshiftc.asm
deleted file mode 100644
index 7cf6a83428..0000000000
--- a/gmp/mpn/powerpc64/lshiftc.asm
+++ /dev/null
@@ -1,210 +0,0 @@
-dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
-
-dnl Copyright 2003, 2005, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.5
-C POWER7 2.15
-
-C TODO
-C * Try to reduce the number of needed live registers
-C * Micro-optimise header code
-C * Keep in synch with lshift.asm and rshift.asm
-C * Could the long-scheduled std insns be less scheduled?
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`cnt', `r6')
-
-define(`tnc',`r0')
-define(`u0',`r30')
-define(`u1',`r31')
-define(`retval',`r5')
-
-ASM_START()
-PROLOGUE(mpn_lshiftc)
- std r31, -8(r1)
- std r30, -16(r1)
- subfic tnc, cnt, 64
- sldi r7, n, 3 C byte count corresponding to n
- add up, up, r7 C up = up + n
- add rp, rp, r7 C rp = rp + n
- rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
- cmpdi cr6, r30, 2
- addi r31, n, 3 C compute count...
- ld r10, -8(up) C load 1st limb for b00...b11
- srd retval, r10, tnc
- srdi r31, r31, 2 C ...for ctr
- mtctr r31 C copy count into ctr
- beq cr0, L(b00)
- blt cr6, L(b01)
- ld r11, -16(up) C load 2nd limb for b10 and b11
- beq cr6, L(b10)
-
- ALIGN(16)
-L(b11): sld r8, r10, cnt
- srd r9, r11, tnc
- ld u1, -24(up)
- addi up, up, -24
- sld r12, r11, cnt
- srd r7, u1, tnc
- addi rp, rp, 16
- bdnz L(gt3)
-
- nor r11, r8, r9
- sld r8, u1, cnt
- nor r8, r8, r8
- b L(cj3)
-
- ALIGN(16)
-L(gt3): ld u0, -8(up)
- nor r11, r8, r9
- sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -16(up)
- nor r10, r12, r7
- b L(L11)
-
- ALIGN(32)
-L(b10): sld r12, r10, cnt
- addi rp, rp, 24
- srd r7, r11, tnc
- bdnz L(gt2)
-
- sld r8, r11, cnt
- nor r10, r12, r7
- nor r8, r8, r8
- b L(cj2)
-
-L(gt2): ld u0, -24(up)
- sld r8, r11, cnt
- srd r9, u0, tnc
- ld u1, -32(up)
- nor r10, r12, r7
- sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -40(up)
- nor r11, r8, r9
- addi up, up, -16
- b L(L10)
-
- ALIGN(16)
-L(b00): ld u1, -16(up)
- sld r12, r10, cnt
- srd r7, u1, tnc
- ld u0, -24(up)
- sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -32(up)
- nor r10, r12, r7
- sld r12, u0, cnt
- srd r7, u1, tnc
- addi rp, rp, 8
- bdz L(cj4)
-
-L(gt4): addi up, up, -32
- ld u0, -8(up)
- nor r11, r8, r9
- b L(L00)
-
- ALIGN(16)
-L(b01): bdnz L(gt1)
- sld r8, r10, cnt
- nor r8, r8, r8
- std r8, -8(rp)
- b L(ret)
-
-L(gt1): ld u0, -16(up)
- sld r8, r10, cnt
- srd r9, u0, tnc
- ld u1, -24(up)
- sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -32(up)
- nor r11, r8, r9
- sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -40(up)
- addi up, up, -40
- nor r10, r12, r7
- bdz L(end)
-
- ALIGN(32)
-L(top): sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -8(up)
- std r11, -8(rp)
- nor r11, r8, r9
-L(L00): sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -16(up)
- std r10, -16(rp)
- nor r10, r12, r7
-L(L11): sld r12, u0, cnt
- srd r7, u1, tnc
- ld u0, -24(up)
- std r11, -24(rp)
- nor r11, r8, r9
-L(L10): sld r8, u1, cnt
- srd r9, u0, tnc
- ld u1, -32(up)
- addi up, up, -32
- std r10, -32(rp)
- addi rp, rp, -32
- nor r10, r12, r7
- bdnz L(top)
-
- ALIGN(32)
-L(end): sld r12, u0, cnt
- srd r7, u1, tnc
- std r11, -8(rp)
-L(cj4): nor r11, r8, r9
- sld r8, u1, cnt
- std r10, -16(rp)
- nor r8, r8, r8
-L(cj3): nor r10, r12, r7
- std r11, -24(rp)
-L(cj2): std r10, -32(rp)
- std r8, -40(rp)
-
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
-ifdef(`HAVE_ABI_mode32',
-` srdi r3, retval, 32
- mr r4, retval
-',` mr r3, retval')
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/add_n.asm b/gmp/mpn/powerpc64/mode32/add_n.asm
index 1da8087fe1..4c62041e73 100644
--- a/gmp/mpn/powerpc64/mode32/add_n.asm
+++ b/gmp/mpn/powerpc64/mode32/add_n.asm
@@ -1,33 +1,22 @@
dnl PowerPC-64/mode32 mpn_add_n -- Add two limb vectors of the same length > 0
dnl and store sum in a third limb vector.
-dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc64/mode32/addmul_1.asm b/gmp/mpn/powerpc64/mode32/addmul_1.asm
index bdc39512ac..41a90781a5 100644
--- a/gmp/mpn/powerpc64/mode32/addmul_1.asm
+++ b/gmp/mpn/powerpc64/mode32/addmul_1.asm
@@ -1,33 +1,22 @@
dnl PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl the result to a second limb vector.
-dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc64/mode32/mul_1.asm b/gmp/mpn/powerpc64/mode32/mul_1.asm
index 3a17e98797..091be4d272 100644
--- a/gmp/mpn/powerpc64/mode32/mul_1.asm
+++ b/gmp/mpn/powerpc64/mode32/mul_1.asm
@@ -1,33 +1,22 @@
dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and add
dnl the result to a second limb vector.
-dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h b/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h
deleted file mode 100644
index a7271381c5..0000000000
--- a/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 2008, 2009, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/* 1800 MHz PPC970 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 1
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 6
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 46
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD 12
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 90
-
-#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 57
-#define MUL_TOOM44_THRESHOLD 94
-#define MUL_TOOM6H_THRESHOLD 125
-#define MUL_TOOM8H_THRESHOLD 187
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 61
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70
-
-#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 98
-#define SQR_TOOM4_THRESHOLD 136
-#define SQR_TOOM6_THRESHOLD 180
-#define SQR_TOOM8_THRESHOLD 272
-
-#define MULMID_TOOM42_THRESHOLD 34
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 13
-
-#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 15, 7}, { 8, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
- { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \
- { 13, 7}, { 29, 8}, { 19, 9}, { 11, 8}, \
- { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
- { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \
- { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \
- { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \
- { 39, 9}, { 83,10}, { 47, 9}, { 95, 8}, \
- { 191, 9}, { 99,10}, { 55,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \
- { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \
- { 319,11}, { 47,10}, { 95, 9}, { 191, 8}, \
- { 383,10}, { 103,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
- { 287,11}, { 79,10}, { 159, 9}, { 319, 8}, \
- { 639,10}, { 175, 9}, { 351, 8}, { 703,11}, \
- { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \
- { 207, 9}, { 415,10}, { 223, 9}, { 447,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
- { 143,10}, { 287, 9}, { 575, 8}, { 1151,11}, \
- { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
- { 351, 9}, { 703,12}, { 95,11}, { 191,10}, \
- { 383, 9}, { 767,11}, { 207,10}, { 415, 9}, \
- { 831,11}, { 223,10}, { 447,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 106
-#define MUL_FFT_THRESHOLD 2688
-
-#define SQR_FFT_MODF_THRESHOLD 212 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 212, 5}, { 13, 6}, { 15, 7}, { 8, 6}, \
- { 17, 7}, { 9, 6}, { 19, 7}, { 13, 8}, \
- { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \
- { 11, 7}, { 23, 8}, { 13, 7}, { 27, 9}, \
- { 7, 8}, { 21, 9}, { 11, 8}, { 25,10}, \
- { 7, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 67,10}, { 39, 9}, { 79, 8}, { 159,10}, \
- { 47, 9}, { 95, 8}, { 191,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \
- { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \
- { 319,11}, { 47, 9}, { 191, 8}, { 383,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 143, 9}, { 287, 8}, { 575,11}, \
- { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \
- { 175, 9}, { 351, 8}, { 703,10}, { 191, 9}, \
- { 383, 8}, { 767,10}, { 207, 9}, { 415,11}, \
- { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 175,10}, { 351, 9}, { 703, 8}, { 1407,11}, \
- { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
- { 415,11}, { 223,10}, { 447,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 102
-#define SQR_FFT_THRESHOLD 1984
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 55
-#define MULLO_MUL_N_THRESHOLD 5240
-
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 108
-#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 126
-
-#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 129
-#define INV_APPR_THRESHOLD 116
-
-#define BINV_NEWTON_THRESHOLD 198
-#define REDC_1_TO_REDC_N_THRESHOLD 51
-
-#define MU_DIV_QR_THRESHOLD 807
-#define MU_DIVAPPR_Q_THRESHOLD 807
-#define MUPI_DIV_QR_THRESHOLD 54
-#define MU_BDIV_QR_THRESHOLD 748
-#define MU_BDIV_Q_THRESHOLD 872
-
-#define POWM_SEC_TABLE 4,35,152,780,2145
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 104
-#define HGCD_APPR_THRESHOLD 118
-#define HGCD_REDUCE_THRESHOLD 1329
-#define GCD_DC_THRESHOLD 268
-#define GCDEXT_DC_THRESHOLD 241
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 9
-#define GET_STR_PRECOMPUTE_THRESHOLD 18
-#define SET_STR_DC_THRESHOLD 996
-#define SET_STR_PRECOMPUTE_THRESHOLD 2170
-
-#define FAC_DSC_THRESHOLD 442
-#define FAC_ODD_THRESHOLD 26
diff --git a/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm b/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm
deleted file mode 100644
index ff5f4b3cfb..0000000000
--- a/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm
+++ /dev/null
@@ -1,117 +0,0 @@
-dnl PowerPC-64 mpn_sqr_diagonal.
-
-dnl Copyright 2001-2003, 2005, 2006, 20010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 18
-C POWER4/PPC970 ?
-C POWER5 7.25
-C POWER6 9.5
-
-C INPUT PARAMETERS
-define(`rp', r3)
-define(`up', r4)
-define(`n', r5)
-
-ASM_START()
-PROLOGUE(mpn_sqr_diagonal)
-ifdef(`HAVE_ABI_mode32',
-` rldicl n, n, 0, 32') C zero extend n
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- addi n, n, 3 C compute count...
- cmpdi cr6, r0, 2
- srdi n, n, 2 C ...for ctr
- mtctr n C copy count into ctr
- beq cr0, L(b00)
- blt cr6, L(b01)
- beq cr6, L(b10)
-
-L(b11): ld r0, 0(up)
- ld r10, 8(up)
- ld r12, 16(up)
- addi rp, rp, -16
- mulld r7, r0, r0
- mulhdu r8, r0, r0
- mulld r9, r10, r10
- mulhdu r10, r10, r10
- mulld r11, r12, r12
- mulhdu r12, r12, r12
- addi up, up, 24
- b L(11)
-
- ALIGN(16)
-L(b01): ld r0, 0(up)
- addi rp, rp, -48
- addi up, up, 8
- mulld r11, r0, r0
- mulhdu r12, r0, r0
- b L(01)
-
- ALIGN(16)
-L(b10): ld r0, 0(up)
- ld r12, 8(up)
- addi rp, rp, -32
- addi up, up, 16
- mulld r9, r0, r0
- mulhdu r10, r0, r0
- mulld r11, r12, r12
- mulhdu r12, r12, r12
- b L(10)
-
- ALIGN(32)
-L(b00):
-L(top): ld r0, 0(up)
- ld r8, 8(up)
- ld r10, 16(up)
- ld r12, 24(up)
- mulld r5, r0, r0
- mulhdu r6, r0, r0
- mulld r7, r8, r8
- mulhdu r8, r8, r8
- mulld r9, r10, r10
- mulhdu r10, r10, r10
- mulld r11, r12, r12
- mulhdu r12, r12, r12
- addi up, up, 32
- std r5, 0(rp)
- std r6, 8(rp)
-L(11): std r7, 16(rp)
- std r8, 24(rp)
-L(10): std r9, 32(rp)
- std r10, 40(rp)
-L(01): std r11, 48(rp)
- std r12, 56(rp)
- addi rp, rp, 64
- bdnz L(top)
-
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode32/sub_n.asm b/gmp/mpn/powerpc64/mode32/sub_n.asm
index 6fdc1d4719..5bcc4a47b5 100644
--- a/gmp/mpn/powerpc64/mode32/sub_n.asm
+++ b/gmp/mpn/powerpc64/mode32/sub_n.asm
@@ -1,33 +1,22 @@
dnl PowerPC-64/mode32 mpn_sub_n -- Subtract two limb vectors of the same
dnl length and store difference in a third limb vector.
-dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc64/mode32/submul_1.asm b/gmp/mpn/powerpc64/mode32/submul_1.asm
index 22601c417e..44ac326994 100644
--- a/gmp/mpn/powerpc64/mode32/submul_1.asm
+++ b/gmp/mpn/powerpc64/mode32/submul_1.asm
@@ -1,33 +1,22 @@
dnl PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
dnl the result from a second limb vector.
-dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc64/mode64/addlsh1_n.asm b/gmp/mpn/powerpc64/mode64/addlsh1_n.asm
new file mode 100644
index 0000000000..15182e1024
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/addlsh1_n.asm
@@ -0,0 +1,82 @@
+dnl PowerPC-64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 2 (1.5 c/l should be possible)
+C POWER4/PPC970: 4 (2.0 c/l should be possible)
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+define(`rp',`r3')
+define(`up',`r4')
+define(`vp',`r5')
+
+define(`s0',`r6')
+define(`s1',`r7')
+define(`u0',`r8')
+define(`v0',`r10')
+define(`v1',`r11')
+
+ASM_START()
+PROLOGUE(mpn_addlsh1_n)
+ mtctr r6 C copy n in ctr (r6 is reused as s0 below)
+ addic r31, r31, 0 C clear cy (adding 0 leaves r31 unchanged)
+
+ ld v0, 0(vp) C load first v limb
+ ld u0, 0(up) C load first u limb
+ addi up, up, -8 C bias up by -8 for the 8/16 offsets used below
+ addi rp, rp, -8 C bias rp by -8 for the 8/16 offsets used below
+ sldi s1, v0, 1 C s1 = first v limb << 1 (nothing shifted in)
+ bdz L(end) C If done (n = 1), skip loop
+
+L(oop): ld v1, 8(vp) C load next v limb
+ adde s1, s1, u0 C add limbs with cy, set cy
+ std s1, 8(rp) C store result limb
+ srdi s0, v0, 63 C s0 = top bit of previous v limb
+ ldu u0, 16(up) C load u limb and update up
+ rldimi s0, v1, 1, 0 C left shift v limb and merge with prev v limb
+
+ bdz L(exit) C decrement ctr and exit if done
+
+ ldu v0, 16(vp) C load v limb and update vp
+ adde s0, s0, u0 C add limbs with cy, set cy
+ stdu s0, 16(rp) C store result limb and update rp
+ srdi s1, v1, 63 C s1 = top bit of previous v limb
+ ld u0, 8(up) C load u limb
+ rldimi s1, v0, 1, 0 C left shift v limb and merge with prev v limb
+
+ bdnz L(oop) C decrement ctr and loop back
+
+L(end): adde r7, s1, u0 C add last limbs with cy, set cy
+ std r7, 8(rp) C store last result limb
+ srdi r3, v0, 63 C r3 = bit shifted out of the last v limb
+ addze r3, r3 C return value = shifted-out bit + cy
+ blr
+L(exit): adde r7, s0, u0 C add last limbs with cy, set cy
+ std r7, 16(rp) C store last result limb
+ srdi r3, v1, 63 C r3 = bit shifted out of the last v limb
+ addze r3, r3 C return value = shifted-out bit + cy
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/addmul_1.asm b/gmp/mpn/powerpc64/mode64/addmul_1.asm
new file mode 100644
index 0000000000..cadab3adf8
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/addmul_1.asm
@@ -0,0 +1,185 @@
+dnl PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 6-18
+C POWER4/PPC970: 8
+C POWER5: 8
+
+C TODO
+C * Reduce the number of registers used. Some mul destination registers could
+C be coalesced.
+C * Delay std for preserving registers, and suppress them for n=1.
+C * Write faster feed-in code. If nothing else, avoid one or two up updates.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vl', `r6')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ std r31, -8(r1) C save r26-r31 below r1; reloaded at L(ret)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2 C cr6 = (n & 3) compared with 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr, i.e. (n + 3) / 4
+ mtctr n C copy count into ctr
+ beq cr0, L(b00) C n & 3 = 0
+ blt cr6, L(b01) C n & 3 = 1
+ beq cr6, L(b10) C n & 3 = 2; fall through for n & 3 = 3
+
+L(b11): ld r26, 0(up) C feed-in: handle one limb, then join L(fic)
+ ld r28, 0(rp)
+ addi up, up, 8
+ nop
+ mulld r0, r26, r6 C low half of up[0] * vl (vl is r6)
+ mulhdu r12, r26, r6 C high half becomes the carry limb r12
+ addc r0, r0, r28 C add rp[0], set cy
+ std r0, 0(rp)
+ addi rp, rp, 8
+ b L(fic)
+
+L(b00): ld r26, 0(up) C feed-in: handle two limbs, then join L(fic)
+ ld r27, 8(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ addi up, up, 16
+ nop
+ mulld r0, r26, r6
+ mulhdu r5, r26, r6 C r5 (n) is free for reuse: count is already in ctr
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r7, r7, r5 C chain high half of limb 0 into low half of limb 1
+ addze r12, r8 C carry limb r12 = high half of limb 1 + cy
+ addc r0, r0, r28 C add rp[0], set cy
+ std r0, 0(rp)
+ adde r7, r7, r29 C add rp[1] with cy, set cy
+ std r7, 8(rp)
+ addi rp, rp, 16
+ b L(fic)
+
+L(b01): bdnz L(gt1) C decrement ctr; branch if more than one limb
+ ld r26, 0(up) C n = 1: single limb, no loop
+ ld r28, 0(rp)
+ mulld r0, r26, r6
+ mulhdu r8, r26, r6 C high half goes to r8, which L(ret) returns
+ addc r0, r0, r28 C add rp[0], set cy
+ std r0, 0(rp)
+ b L(ret)
+L(gt1): ld r26, 0(up) C feed-in: handle three limbs, then join L(fic)
+ ld r27, 8(up)
+ mulld r0, r26, r6
+ mulhdu r5, r26, r6
+ ld r26, 16(up)
+ ld r28, 0(rp)
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r9, r26, r6
+ mulhdu r10, r26, r6
+ addc r7, r7, r5 C chain high halves into the next low halves
+ adde r9, r9, r8
+ addze r12, r10 C carry limb r12 = high half of limb 2 + cy
+ addc r0, r0, r28 C add the three rp limbs, carrying through cy
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ adde r9, r9, r30
+ std r9, 16(rp)
+ addi up, up, 24
+ addi rp, rp, 24
+ b L(fic)
+
+L(b10): addic r0, r0, 0 C clear cy (no feed-in limbs for n & 3 = 2)
+ li r12, 0 C cy_limb = 0
+L(fic): ld r26, 0(up) C preload the next two u limbs
+ ld r27, 8(up)
+ addi up, up, 16
+ bdz L(end) C decrement ctr; only those two limbs remain if zero
+ C registers dying
+L(top): mulld r0, r26, r6 C
+ mulhdu r5, r26, r6 C 26
+ ld r26, 0(up) C
+ ld r28, 0(rp) C
+ mulld r7, r27, r6 C
+ mulhdu r8, r27, r6 C 27
+ ld r27, 8(up) C
+ ld r29, 8(rp) C
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r5 C 5 7
+ mulld r9, r26, r6 C
+ mulhdu r10, r26, r6 C 26
+ ld r26, 16(up) C
+ ld r30, 16(rp) C
+ mulld r11, r27, r6 C
+ mulhdu r12, r27, r6 C 27
+ ld r27, 24(up) C
+ ld r31, 24(rp) C
+ adde r9, r9, r8 C 8 9
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r9, r9, r30 C 9 30
+ std r9, 16(rp) C 9
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32 C
+ addi rp, rp, 32 C
+ bdnz L(top) C
+
+L(end): mulld r0, r26, r6 C wind-down: the two preloaded limbs
+ mulhdu r5, r26, r6
+ ld r28, 0(rp)
+ nop
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r29, 8(rp)
+ nop
+ adde r0, r0, r12 C add incoming carry limb and cy
+ adde r7, r7, r5
+ addze r8, r8 C fold cy into the final high half
+ addc r0, r0, r28 C add the two rp limbs
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+L(ret): addze r3, r8 C return value = final high half + cy
+ ld r31, -8(r1) C reload the registers saved at entry
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/aors_n.asm b/gmp/mpn/powerpc64/mode64/aors_n.asm
index 0e8474fdcc..42b6d79472 100644
--- a/gmp/mpn/powerpc64/mode64/aors_n.asm
+++ b/gmp/mpn/powerpc64/mode64/aors_n.asm
@@ -1,41 +1,56 @@
dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.5
-C POWER4/PPC970 2
-C POWER5 2
-C POWER6 2.63
-C POWER7 2.25-2.87
+C cycles/limb
+C POWER3/PPC630: 1.5
+C POWER4/PPC970: 2
+
+C n POWER3/PPC630 POWER4/PPC970
+C 1 17.00 19.00
+C 2 9.00 10.49
+C 3 5.33 7.66
+C 4 4.50 5.14
+C 5 4.20 4.80
+C 6 3.83 4.33
+C 7 3.00 3.99
+C 8 2.87 3.55
+C 9 2.89 3.40
+C 10 2.60 3.42
+C 11 2.45 3.15
+C 12 2.41 2.99
+C 13 2.46 3.01
+C 14 2.42 2.97
+C 15 2.20 2.85
+C 50 1.78 2.44
+C 100 1.83 2.20
+C 200 1.55 2.12
+C 400 1.53 2.05
+C 1000 1.98 2.02#
+C 2000 1.50# 2.04
+C 4000 2.55 2.50
+C 8000 2.70 2.45
+C 16000 2.65 5.94
+C 32000 2.62 16.41
+C 64000 2.73 18.94
C This code is a little bit slower for POWER3/PPC630 than the simple code used
C previously, but it is much faster for POWER4/PPC970. The reason for the
@@ -147,8 +162,7 @@ L(go): ld r6, 0(r4) C load s1 limb
addi r4, r4, 32
addi r5, r5, 32
- ALIGN(16)
-L(top): ADDSUBC r28, r7, r6
+L(oop): ADDSUBC r28, r7, r6
ld r6, 0(r4) C load s1 limb
ld r7, 0(r5) C load s2 limb
ADDSUBC r29, r9, r8
@@ -167,7 +181,7 @@ L(top): ADDSUBC r28, r7, r6
std r30, 16(r3)
std r31, 24(r3)
addi r3, r3, 32
- bdnz L(top) C decrement ctr and loop back
+ bdnz L(oop) C decrement ctr and loop back
L(end): ADDSUBC r28, r7, r6
ADDSUBC r29, r9, r8
diff --git a/gmp/mpn/powerpc64/mode64/aorsmul_1.asm b/gmp/mpn/powerpc64/mode64/aorsmul_1.asm
deleted file mode 100644
index 0c12f9b660..0000000000
--- a/gmp/mpn/powerpc64/mode64/aorsmul_1.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1.
-
-dnl Copyright 1999-2001, 2003-2006, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C mpn_addmul_1 mpn_submul_1
-C cycles/limb cycles/limb
-C POWER3/PPC630 6-18 6-18
-C POWER4/PPC970 8 8.3
-C POWER5 8 8.25
-C POWER6 16.25 16.75
-C POWER7 3.77 4.9
-
-C TODO
-C * Try to reduce the number of needed live registers
-C * Add support for _1c entry points
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`vl', `r6')
-
-ifdef(`OPERATION_addmul_1',`
- define(ADDSUBC, adde)
- define(ADDSUB, addc)
- define(func, mpn_addmul_1)
- define(func_nc, mpn_addmul_1c) C FIXME: not really supported
- define(SM, `')
-')
-ifdef(`OPERATION_submul_1',`
- define(ADDSUBC, subfe)
- define(ADDSUB, subfc)
- define(func, mpn_submul_1)
- define(func_nc, mpn_submul_1c) C FIXME: not really supported
- define(SM, `$1')
-')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-ASM_START()
-PROLOGUE(func)
- std r31, -8(r1)
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- std r30, -16(r1)
- cmpdi cr6, r0, 2
- std r29, -24(r1)
- addi n, n, 3 C compute count...
- std r28, -32(r1)
- srdi n, n, 2 C ...for ctr
- std r27, -40(r1)
- mtctr n C copy count into ctr
- beq cr0, L(b00)
- blt cr6, L(b01)
- beq cr6, L(b10)
-
-L(b11): ld r9, 0(up)
- ld r28, 0(rp)
- mulld r0, r9, r6
- mulhdu r12, r9, r6
- ADDSUB r0, r0, r28
- std r0, 0(rp)
- addi rp, rp, 8
- ld r9, 8(up)
- ld r27, 16(up)
- addi up, up, 24
-SM(` subfe r11, r11, r11 ')
- b L(bot)
-
- ALIGN(16)
-L(b00): ld r9, 0(up)
- ld r27, 8(up)
- ld r28, 0(rp)
- ld r29, 8(rp)
- mulld r0, r9, r6
- mulhdu r5, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- addc r7, r7, r5
- addze r12, r8
- ADDSUB r0, r0, r28
- std r0, 0(rp)
- ADDSUBC r7, r7, r29
- std r7, 8(rp)
- addi rp, rp, 16
- ld r9, 16(up)
- ld r27, 24(up)
- addi up, up, 32
-SM(` subfe r11, r11, r11 ')
- b L(bot)
-
- ALIGN(16)
-L(b01): bdnz L(gt1)
- ld r9, 0(up)
- ld r11, 0(rp)
- mulld r0, r9, r6
- mulhdu r8, r9, r6
- ADDSUB r0, r0, r11
- std r0, 0(rp)
-SM(` subfe r11, r11, r11 ')
-SM(` addic r11, r11, 1 ')
- addze r3, r8
- blr
-L(gt1): ld r9, 0(up)
- ld r27, 8(up)
- mulld r0, r9, r6
- mulhdu r5, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r9, 16(up)
- ld r28, 0(rp)
- ld r29, 8(rp)
- ld r30, 16(rp)
- mulld r11, r9, r6
- mulhdu r10, r9, r6
- addc r7, r7, r5
- adde r11, r11, r8
- addze r12, r10
- ADDSUB r0, r0, r28
- std r0, 0(rp)
- ADDSUBC r7, r7, r29
- std r7, 8(rp)
- ADDSUBC r11, r11, r30
- std r11, 16(rp)
- addi rp, rp, 24
- ld r9, 24(up)
- ld r27, 32(up)
- addi up, up, 40
-SM(` subfe r11, r11, r11 ')
- b L(bot)
-
-L(b10): addic r0, r0, 0
- li r12, 0 C cy_limb = 0
- ld r9, 0(up)
- ld r27, 8(up)
- bdz L(end)
- addi up, up, 16
-
- ALIGN(16)
-L(top): mulld r0, r9, r6
- mulhdu r5, r9, r6 C 9
- mulld r7, r27, r6
- mulhdu r8, r27, r6 C 27
- ld r9, 0(up)
- ld r28, 0(rp)
- ld r27, 8(up)
- ld r29, 8(rp)
- adde r0, r0, r12 C 0 12
- adde r7, r7, r5 C 5 7
- mulld r5, r9, r6
- mulhdu r10, r9, r6 C 9
- mulld r11, r27, r6
- mulhdu r12, r27, r6 C 27
- ld r9, 16(up)
- ld r30, 16(rp)
- ld r27, 24(up)
- ld r31, 24(rp)
- adde r5, r5, r8 C 8 5
- adde r11, r11, r10 C 10 11
- addze r12, r12 C 12
- ADDSUB r0, r0, r28 C 0 28
- std r0, 0(rp) C 0
- ADDSUBC r7, r7, r29 C 7 29
- std r7, 8(rp) C 7
- ADDSUBC r5, r5, r30 C 5 30
- std r5, 16(rp) C 5
- ADDSUBC r11, r11, r31 C 11 31
- std r11, 24(rp) C 11
- addi up, up, 32
-SM(` subfe r11, r11, r11 ')
- addi rp, rp, 32
-L(bot):
-SM(` addic r11, r11, 1 ')
- bdnz L(top)
-
-L(end): mulld r0, r9, r6
- mulhdu r5, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r28, 0(rp)
- ld r29, 8(rp)
- adde r0, r0, r12
- adde r7, r7, r5
- addze r8, r8
- ADDSUB r0, r0, r28
- std r0, 0(rp)
- ADDSUBC r7, r7, r29
- std r7, 8(rp)
-SM(` subfe r11, r11, r11 ')
-SM(` addic r11, r11, 1 ')
- addze r3, r8
- ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm b/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
deleted file mode 100644
index 2c5400ab52..0000000000
--- a/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
-
-dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 1)
-define(RSH, 63)
-
-ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
-
-include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm b/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
deleted file mode 100644
index 447791abb0..0000000000
--- a/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
-
-dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 2)
-define(RSH, 62)
-
-ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
-
-include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm b/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm
deleted file mode 100644
index 6158f541fc..0000000000
--- a/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm
+++ /dev/null
@@ -1,187 +0,0 @@
-dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
-
-dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-C cycles/limb
-C POWER3/PPC630 1.83 (1.5 c/l should be possible)
-C POWER4/PPC970 3 (2.0 c/l should be possible)
-C POWER5 3
-C POWER6 3.5-47
-C POWER7 3
-
-C STATUS
-C * Try combining upx+up, and vpx+vp.
-C * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is
-C greater than the 2nd operand. Yes, this addition is non-commutative wrt
-C performance.
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`vp', `r5')
-define(`n', `r6')
-
-ifdef(`DO_add', `
- define(`ADDSUBC', `addc $1, $2, $3')
- define(`ADDSUBE', `adde $1, $2, $3')
- define(INITCY, `addic $1, r1, 0')
- define(RETVAL, `addze r3, $1')
- define(`func', mpn_addlsh`'LSH`'_n)')
-ifdef(`DO_sub', `
- define(`ADDSUBC', `subfc $1, $2, $3')
- define(`ADDSUBE', `subfe $1, $2, $3')
- define(INITCY, `addic $1, r1, -1')
- define(RETVAL, `subfze r3, $1
- neg r3, r3')
- define(`func', mpn_sublsh`'LSH`'_n)')
-ifdef(`DO_rsb', `
- define(`ADDSUBC', `subfc $1, $3, $2')
- define(`ADDSUBE', `subfe $1, $3, $2')
- define(INITCY, `addic $1, r1, -1')
- define(RETVAL, `addme r3, $1')
- define(`func', mpn_rsblsh`'LSH`'_n)')
-
-define(`rpx', `r6')
-define(`upx', `r7')
-define(`vpx', `r12')
-
-define(`s0', `r0') define(`s1', `r9')
-define(`u0', `r8')
-define(`v0', `r10') define(`v1', `r11')
-
-
-ASM_START()
-PROLOGUE(func)
- cmpldi cr0, n, 13
- bgt L(big)
-
- mtctr n C copy n in ctr
- INITCY( r0) C clear cy
-
- ld v0, 0(vp) C load v limb
- ld u0, 0(up) C load u limb
- addi up, up, -8 C update up
- addi rp, rp, -8 C update rp
- sldi s1, v0, LSH
- bdz L(ex1) C If done, skip loop
-
- ALIGN(16)
-L(lo0): ld v1, 8(vp) C load v limb
- ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
- ldu u0, 16(up) C load u limb and update up
- srdi s0, v0, RSH C shift down previous v limb
- std s1, 8(rp) C store result limb
- rldimi s0, v1, LSH, 0 C left shift v limb and merge with prev v limb
- bdz L(ex0) C decrement ctr and exit if done
- ldu v0, 16(vp) C load v limb and update vp
- ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
- ld u0, 8(up) C load u limb
- srdi s1, v1, RSH C shift down previous v limb
- stdu s0, 16(rp) C store result limb and update rp
- rldimi s1, v0, LSH, 0 C left shift v limb and merge with prev v limb
- bdnz L(lo0) C decrement ctr and loop back
-
-L(ex1): ADDSUBE(r7, s1, u0)
- std r7, 8(rp) C store last result limb
- srdi r0, v0, RSH
- RETVAL( r0)
- blr
-L(ex0): ADDSUBE(r7, s0, u0)
- std r7, 16(rp) C store last result limb
- srdi r0, v1, RSH
- RETVAL( r0)
- blr
-
-
-L(big): rldicl. r0, n, 0,63 C r0 = n & 1, set cr0
- addi r6, n, -1 C ...for ctr
- srdi r6, r6, 1 C ...for ctr
- mtctr r6 C copy count into ctr
- beq cr0, L(b0)
-
-L(b1): ld v1, 0(vp)
- ld u0, 0(up)
- sldi s1, v1, LSH
- srdi s0, v1, RSH
- ld v0, 8(vp)
- ADDSUBC(s1, s1, u0) C add limbs without cy, set cy
- addi rpx, rp, -16
- addi rp, rp, -8
- sub upx, up, rp
- sub vpx, vp, rp
- sub up, up, rpx
- sub vp, vp, rpx
- addi up, up, 8
- addi upx, upx, 16
- addi vp, vp, 16
- addi vpx, vpx, 24
- b L(mid)
-
-L(b0): ld v0, 0(vp)
- ld u0, 0(up)
- sldi s0, v0, LSH
- srdi s1, v0, RSH
- ld v1, 8(vp)
- ADDSUBC(s0, s0, u0) C add limbs without cy, set cy
- addi rpx, rp, -8
- addi rp, rp, -16
- sub upx, up, rpx
- sub vpx, vp, rpx
- sub up, up, rp
- sub vp, vp, rp
- addi up, up, 8
- addi upx, upx, 16
- addi vp, vp, 16
- addi vpx, vpx, 24
-
- ALIGN(32)
-L(top): ldx u0, rp, up
- ldx v0, rp, vp
- rldimi s1, v1, LSH, 0
- stdu s0, 16(rp)
- srdi s0, v1, RSH
- ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
-L(mid): ldx u0, rpx, upx
- ldx v1, rpx, vpx
- rldimi s0, v0, LSH, 0
- stdu s1, 16(rpx)
- srdi s1, v0, RSH
- ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
- bdnz L(top) C decrement CTR and loop back
-
- ldx u0, rp, up
- rldimi s1, v1, LSH, 0
- std s0, 16(rp)
- srdi s0, v1, RSH
- ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
- std s1, 24(rp)
-
- RETVAL( s0)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm
index 45cded9715..8c1e87e1ee 100644
--- a/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm
+++ b/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -1,41 +1,28 @@
dnl PPC64 mpn_bdiv_dbm1c.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 8.25
-C POWER5 8.5 fluctuating as function of n % 3
-C POWER6 15
-C POWER7 4.75
+C cycles/limb
+C POWER3/PPC630: 6-18
+C POWER4/PPC970: 8.5
+C POWER5: ?
C TODO
C * Nothing to do...
@@ -61,7 +48,6 @@ PROLOGUE(mpn_bdiv_dbm1c)
blt cr6, L(b01)
beq cr6, L(b10)
- ALIGN(16)
L(b11): mulld r5, r0, r6
mulhdu r12, r0, r6
ld r0, 8(r4)
@@ -69,14 +55,13 @@ L(b11): mulld r5, r0, r6
addi r3, r3, -24
b L(3)
- ALIGN(16)
L(b00): mulld r9, r0, r6
mulhdu r8, r0, r6
+ ld r0, 8(r4)
addi r4, r4, -16
addi r3, r3, -16
b L(0)
- ALIGN(16)
L(b01): mulld r5, r0, r6
mulhdu r12, r0, r6
addi r3, r3, -8
@@ -85,43 +70,42 @@ L(b01): mulld r5, r0, r6
addi r4, r4, -8
b L(1)
- ALIGN(16)
L(b10): mulld r9, r0, r6
mulhdu r8, r0, r6
+ ld r0, 8(r4)
ble cr7, L(e2)
ALIGN(16)
-L(top): subfc r11, r9, r7
- ld r10, 8(r4)
+L(top): mulld r5, r0, r6
+ mulhdu r12, r0, r6
+ subfc r11, r9, r7
ld r0, 16(r4)
subfe r7, r8, r11
std r11, 0(r3)
- mulld r5, r10, r6
- mulhdu r12, r10, r6
L(1): mulld r9, r0, r6
mulhdu r8, r0, r6
subfc r11, r5, r7
+ ld r0, 24(r4)
subfe r7, r12, r11
std r11, 8(r3)
-L(0): subfc r11, r9, r7
- ld r10, 24(r4)
+L(0): mulld r5, r0, r6
+ mulhdu r12, r0, r6
+ subfc r11, r9, r7
ld r0, 32(r4)
subfe r7, r8, r11
std r11, 16(r3)
- mulld r5, r10, r6
- mulhdu r12, r10, r6
L(3): mulld r9, r0, r6
mulhdu r8, r0, r6
subfc r11, r5, r7
+ ld r0, 40(r4)
subfe r7, r12, r11
std r11, 24(r3)
addi r4, r4, 32
addi r3, r3, 32
bdnz L(top)
-L(e2): ld r10, 8(r4)
- mulld r5, r10, r6
- mulhdu r12, r10, r6
+L(e2): mulld r5, r0, r6
+ mulhdu r12, r0, r6
subfc r11, r9, r7
subfe r7, r8, r11
std r11, 0(r3)
diff --git a/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm b/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm
deleted file mode 100644
index 24968c1912..0000000000
--- a/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl PowerPC-64 mpn_cnd_add_n/mpn_cnd_sub_n.
-
-dnl Copyright 1999-2001, 2003-2005, 2007, 2011, 2012 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 2.25
-C POWER5 ?
-C POWER6 3
-C POWER7 2
-
-C INPUT PARAMETERS
-define(`cnd', `r3')
-define(`rp', `r4')
-define(`up', `r5')
-define(`vp', `r6')
-define(`n', `r7')
-
-ifdef(`OPERATION_cnd_add_n',`
- define(ADDSUBC, adde)
- define(ADDSUB, addc)
- define(func, mpn_cnd_add_n)
- define(GENRVAL, `addi r3, r3, 1')
- define(SETCBR, `addic r0, $1, -1')
- define(CLRCB, `addic r0, r0, 0')
-')
-ifdef(`OPERATION_cnd_sub_n',`
- define(ADDSUBC, subfe)
- define(ADDSUB, subfc)
- define(func, mpn_cnd_sub_n)
- define(GENRVAL, `neg r3, r3')
- define(SETCBR, `subfic r0, $1, 0')
- define(CLRCB, `addic r0, r1, -1')
-')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ASM_START()
-PROLOGUE(func)
- std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
-
- subfic cnd, cnd, 0
- subfe cnd, cnd, cnd
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2
- addi n, n, 3 C compute count...
- srdi n, n, 2 C ...for ctr
- mtctr n C copy count into ctr
- beq cr0, L(b00)
- blt cr6, L(b01)
- beq cr6, L(b10)
-
-L(b11): ld r8, 0(up) C load s1 limb
- ld r9, 0(vp) C load s2 limb
- ld r10, 8(up) C load s1 limb
- ld r11, 8(vp) C load s2 limb
- ld r12, 16(up) C load s1 limb
- addi up, up, 24
- ld r0, 16(vp) C load s2 limb
- addi vp, vp, 24
- and r9, r9, cnd
- and r11, r11, cnd
- and r0, r0, cnd
- ADDSUB r29, r9, r8
- ADDSUBC r30, r11, r10
- ADDSUBC r31, r0, r12
- std r29, 0(rp)
- std r30, 8(rp)
- std r31, 16(rp)
- addi rp, rp, 24
- bdnz L(go)
- b L(ret)
-
-L(b01): ld r12, 0(up) C load s1 limb
- addi up, up, 8
- ld r0, 0(vp) C load s2 limb
- addi vp, vp, 8
- and r0, r0, cnd
- ADDSUB r31, r0, r12 C add
- std r31, 0(rp)
- addi rp, rp, 8
- bdnz L(go)
- b L(ret)
-
-L(b10): ld r10, 0(up) C load s1 limb
- ld r11, 0(vp) C load s2 limb
- ld r12, 8(up) C load s1 limb
- addi up, up, 16
- ld r0, 8(vp) C load s2 limb
- addi vp, vp, 16
- and r11, r11, cnd
- and r0, r0, cnd
- ADDSUB r30, r11, r10 C add
- ADDSUBC r31, r0, r12 C add
- std r30, 0(rp)
- std r31, 8(rp)
- addi rp, rp, 16
- bdnz L(go)
- b L(ret)
-
-L(b00): CLRCB C clear/set cy
-L(go): ld r7, 0(up) C load s1 limb
- ld r27, 0(vp) C load s2 limb
- ld r8, 8(up) C load s1 limb
- ld r9, 8(vp) C load s2 limb
- ld r10, 16(up) C load s1 limb
- ld r11, 16(vp) C load s2 limb
- ld r12, 24(up) C load s1 limb
- ld r0, 24(vp) C load s2 limb
- and r27, r27, cnd
- and r9, r9, cnd
- and r11, r11, cnd
- and r0, r0, cnd
- bdz L(end)
-
- addi up, up, 32
- addi vp, vp, 32
-
-L(top): ADDSUBC r28, r27, r7
- ld r7, 0(up) C load s1 limb
- ld r27, 0(vp) C load s2 limb
- ADDSUBC r29, r9, r8
- ld r8, 8(up) C load s1 limb
- ld r9, 8(vp) C load s2 limb
- ADDSUBC r30, r11, r10
- ld r10, 16(up) C load s1 limb
- ld r11, 16(vp) C load s2 limb
- ADDSUBC r31, r0, r12
- ld r12, 24(up) C load s1 limb
- ld r0, 24(vp) C load s2 limb
- std r28, 0(rp)
- addi up, up, 32
- std r29, 8(rp)
- addi vp, vp, 32
- std r30, 16(rp)
- std r31, 24(rp)
- addi rp, rp, 32
- and r27, r27, cnd
- and r9, r9, cnd
- and r11, r11, cnd
- and r0, r0, cnd
- bdnz L(top) C decrement ctr and loop back
-
-L(end): ADDSUBC r28, r27, r7
- ADDSUBC r29, r9, r8
- ADDSUBC r30, r11, r10
- ADDSUBC r31, r0, r12
- std r28, 0(rp)
- std r29, 8(rp)
- std r30, 16(rp)
- std r31, 24(rp)
-
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
-
- subfe r3, r0, r0 C -cy
- GENRVAL
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/dive_1.asm b/gmp/mpn/powerpc64/mode64/dive_1.asm
index 434dde9145..a4a06da26c 100644
--- a/gmp/mpn/powerpc64/mode64/dive_1.asm
+++ b/gmp/mpn/powerpc64/mode64/dive_1.asm
@@ -1,45 +1,32 @@
dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
-dnl Copyright 2006, 2010 Free Software Foundation, Inc.
+dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm
-C POWER3/PPC630 13-19
-C POWER4/PPC970 16
-C POWER5 16 16
-C POWER6 37 46
-C POWER7 12 12
+C cycles/limb
+C POWER3/PPC630: 13-19
+C POWER4/PPC970: 16
+C POWER5: 16
C TODO
-C * Check if n=1 code is really an improvement. It probably isn't.
+C * Check if n=1 code is really an improvement. It probably isn't.
+C * Perhaps remove L(norm) code, it is currently unreachable.
C * Make more similar to mode1o.asm.
C INPUT PARAMETERS
@@ -53,7 +40,7 @@ ASM_START()
EXTERN(binvert_limb_table)
-PROLOGUE(mpn_divexact_1,toc)
+PROLOGUE(mpn_divexact_1)
addic. n, n, -1
ld r12, 0(up)
bne cr0, L(2)
@@ -74,6 +61,7 @@ L(7):
mtctr n
LEA( r5, binvert_limb_table)
rldicl r11, d, 63, 57
+C cmpdi cr7, r0, 0
lbzx r0, r5, r11
mulld r9, r0, r0
sldi r0, r0, 1
@@ -87,27 +75,26 @@ L(7):
sldi r0, r0, 1
mulld r9, d, r9
subf r7, r9, r0 C r7 = 1/d mod 2^64
- bne cr0, L(norm)
+C beq cr7, L(norm)
subfic r8, r10, 64 C set carry as side effect
li r5, 0
- srd r11, r12, r10
ALIGN(16)
L(loop0):
+ srd r11, r12, r10
ld r12, 8(up)
- nop
addi up, up, 8
sld r0, r12, r8
or r11, r11, r0
subfe r9, r5, r11
- srd r11, r12, r10
mulld r0, r7, r9
- mulhdu r5, r0, d
std r0, 0(rp)
addi rp, rp, 8
+ mulhdu r5, r0, d
bdnz L(loop0)
- subfe r0, r5, r11
+ srd r0, r12, r10
+ subfe r0, r5, r0
mulld r0, r7, r0
std r0, 0(rp)
blr
@@ -115,15 +102,14 @@ L(loop0):
ALIGN(16)
L(norm):
mulld r11, r12, r7
- mulhdu r5, r11, d
std r11, 0(rp)
ALIGN(16)
L(loop1):
+ mulhdu r5, r11, d
ld r9, 8(up)
addi up, up, 8
subfe r5, r5, r9
mulld r11, r7, r5
- mulhdu r5, r11, d C result not used
std r11, 8(rp)
addi rp, rp, 8
bdnz L(loop1)
diff --git a/gmp/mpn/powerpc64/mode64/diveby3.asm b/gmp/mpn/powerpc64/mode64/diveby3.asm
new file mode 100644
index 0000000000..d96f775d71
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/diveby3.asm
@@ -0,0 +1,83 @@
+dnl PowerPC-64 mpn_divexact_by3 -- mpn by 3 exact division
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 13
+C POWER4/PPC970: 13
+C POWER5: 13
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cy', `r6')
+
+define(`xAAAAAAAB',`r7')
+define(`xAAAAAAAA', `r9')
+define(`q', `r10')
+define(`ul', `r11')
+define(`one', `r12')
+
+
+ASM_START()
+PROLOGUE(mpn_divexact_by3c)
+C Exact division of the n limbs at up by 3, initial carry in cy; quotient limbs go to rp, carry out in r3.
+ mtctr r5 C ctr = n (r5 is `n')
+ li r7, -0x5556 C 0xFFFFFFFFFFFFAAAA
+ ld ul, 0(up) C ul = up[0]
+ rldimi r7, r7, 16, 32 C 0xFFFFFFFFAAAAAAAA
+ rldimi r7, r7, 32, 63 C 0xAAAAAAAAAAAAAAAB = 1/3
+
+ addi r9, r7, -1 C 0xAAAAAAAAAAAAAAAA
+ li one, 1 C constant 1 for the subfe below
+
+ subfc ul, cy, ul C C = (cy <= up[0])
+ subfe cy, r1, r1 C cy = -(cy > up[0])
+ bdz L(end) C ctr reached 0 (n = 1): skip the loop
+
+ ALIGN(16)
+L(top): mulld q, ul, xAAAAAAAB C q = low 64 bits of ul * (1/3 mod 2^64)
+
+ ld ul, 8(up) C fetch the next source limb
+ addi up, up, 8
+ addc r0, xAAAAAAAA, q C set C flag if q >= 0x5555...56
+
+ subfe cy, cy, one C cy = 1-cy-1+C
+ subfc r0, q, xAAAAAAAA C set C flag if q <= 0xAAAA...AA
+
+ subfe ul, cy, ul C ul = ul-cy-1+C
+ std q, 0(rp) C store quotient limb
+ addi rp, rp, 8
+
+ subfe cy, r1, r1 C cy = C-1, i.e. 0 or -1; the value of r1 is irrelevant
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): mulld q, ul, xAAAAAAAB C quotient limb for the last source limb
+ addc r0, xAAAAAAAA, q C set C flag if q >= 0x5555...56
+
+ subfe cy, cy, one C cy = 1-cy-1+C
+ subfc r0, q, xAAAAAAAA C set C flag if q <= 0xAAAA...AA
+
+ std q, 0(rp) C store last quotient limb
+ subfe r3, r1, r1 C r3 = C-1, i.e. 0 or -1
+ subf r3, r3, cy C return value r3 = cy - r3
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/divrem_1.asm b/gmp/mpn/powerpc64/mode64/divrem_1.asm
index b283877006..895badfe61 100644
--- a/gmp/mpn/powerpc64/mode64/divrem_1.asm
+++ b/gmp/mpn/powerpc64/mode64/divrem_1.asm
@@ -1,42 +1,29 @@
dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.
-dnl Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm frac
-C POWER3/PPC630 16-34 16-34 ~11 outdated figures
-C POWER4/PPC970 28 28 19
-C POWER5 29 29 ~19
-C POWER6 49 59 ~42
-C POWER7 24.5 23 ~14
+C cycles/limb
+C norm unorm frac
+C POWER3/PPC630 16-34 16-34 ~11
+C POWER4/PPC970 29 19
+C POWER5 29 29 ~20
C INPUT PARAMETERS
C qp = r3
@@ -56,7 +43,7 @@ ASM_START()
EXTERN_FUNC(mpn_invert_limb)
-PROLOGUE(mpn_divrem_1,toc)
+PROLOGUE(mpn_divrem_1)
mfcr r12
add. r10, r6, r4
@@ -107,6 +94,7 @@ L(71):
sld r31, r31, r27
mr r3, r30
CALL( mpn_invert_limb)
+ nop
beq- cr4, L(110)
sldi r9, r28, 3
addic. r6, r28, -2
@@ -122,23 +110,23 @@ L(71):
sldi r6, r6, 3
ALIGN(16)
L(uloop):
+ addi r11, r31, 1
ldx r8, r26, r6
- nop
mulld r0, r31, r3
mulhdu r10, r31, r3
- addi r11, r31, 1
- srd r9, r8, r5
addi r6, r6, -8
+ srd r9, r8, r5
or r9, r7, r9
addc r0, r0, r9
adde r10, r10, r11
mulld r31, r10, r30
subf r31, r31, r9
- subfc r0, r31, r0 C r <= ql
- subfe r0, r0, r0 C r0 = -(r <= ql)
- and r9, r30, r0
- add r31, r31, r9
- add r10, r0, r10 C qh -= (r >= ql)
+ subfc r0, r0, r31 C r >= ql
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
cmpld cr7, r31, r30
bge- cr7, L(164)
L(123):
@@ -175,19 +163,19 @@ L(110):
L(ufloop):
addi r11, r31, 1
nop
- mulld r0, r3, r31
+ mulld r7, r3, r31
mulhdu r10, r3, r31
add r10, r10, r11
mulld r31, r9, r10
ifelse(0,1,`
- subfc r0, r0, r31
+ subfc r0, r7, r31
subfe r0, r0, r0 C r0 = -(r >= ql)
not r7, r0
add r10, r7, r10 C qh -= (r >= ql)
andc r0, r30, r0
add r31, r31, r0
',`
- cmpld cr7, r31, r0
+ cmpld cr7, r31, r7
blt cr7, L(29)
add r31, r30, r31
addi r10, r10, -1
@@ -228,11 +216,12 @@ L(162):
and r0, r0, r7
subf r31, r0, r31
L(8):
+L(10):
mr r3, r30
CALL( mpn_invert_limb)
- li r27, 0
+ nop
addic. r6, r28, -1
- blt- cr0, L(110)
+ blt- cr0, L(150)
mtctr r28
sldi r6, r6, 3
ALIGN(16)
@@ -240,25 +229,70 @@ L(nloop):
addi r11, r31, 1
ldx r8, r26, r6
mulld r0, r31, r3
- mulhdu r10, r31, r3
addi r6, r6, -8
- addc r0, r0, r8
+ mulhdu r10, r31, r3
+ addc r7, r0, r8
adde r10, r10, r11
mulld r31, r10, r30
subf r31, r31, r8 C r = nl - qh * d
- subfc r0, r31, r0 C r <= ql
- subfe r0, r0, r0 C r0 = -(r <= ql)
- and r9, r30, r0
- add r31, r31, r9
- add r10, r0, r10 C qh -= (r >= ql)
+ subfc r0, r7, r31 C r >= ql
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
cmpld cr7, r31, r30
bge- cr7, L(167)
L(51):
std r10, 0(r29)
addi r29, r29, -8
bdnz L(nloop)
- b L(110)
+L(150):
+ addic. r9, r25, -1
+ blt- cr0, L(152)
+ mtctr r25
+ neg r9, r30
+ ALIGN(16)
+L(nfloop):
+ addi r11, r31, 1
+ nop
+ mulld r7, r3, r31
+ mulhdu r10, r3, r31
+ add r10, r10, r11
+ mulld r31, r9, r10
+ifelse(0,1,`
+ subfc r0, r7, r31
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+',`
+ cmpld cr7, r31, r7
+ blt cr7, L(28)
+ add r31, r30, r31
+ addi r10, r10, -1
+L(28):
+')
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(nfloop)
+L(152):
+ addi r1, r1, 176
+ mr r3, r31
+ ld r0, 16(r1)
+ lwz r12, 8(r1)
+ mtlr r0
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ mtcrf 8, r12
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
L(164):
subf r31, r30, r31
addi r10, r10, 1
diff --git a/gmp/mpn/powerpc64/mode64/divrem_2.asm b/gmp/mpn/powerpc64/mode64/divrem_2.asm
index 73ec23c94d..369b5c1f1d 100644
--- a/gmp/mpn/powerpc64/mode64/divrem_2.asm
+++ b/gmp/mpn/powerpc64/mode64/divrem_2.asm
@@ -3,40 +3,30 @@ dnl PPC-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
dnl Copyright 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm frac
+C cycles/limb
+C norm frac
C POWER3/PPC630
-C POWER4/PPC970 ? ?
-C POWER5 37 ?
-C POWER6 62 ?
-C POWER6 30.5 ?
+C POWER4/PPC970 39* 39*
+C POWER5 39* 39*
+
+C STATUS
+C * Performance fluctuates like crazy
C INPUT PARAMETERS
C qp = r3
@@ -53,7 +43,7 @@ ASM_START()
EXTERN_FUNC(mpn_invert_limb)
-PROLOGUE(mpn_divrem_2,toc)
+PROLOGUE(mpn_divrem_2)
mflr r0
std r23, -72(r1)
std r24, -64(r1)
@@ -107,6 +97,7 @@ L(8):
blt cr0, L(18)
mr r3, r30
CALL( mpn_invert_limb)
+ nop
mulld r10, r3, r30
mulhdu r0, r3, r28
addc r8, r10, r28
@@ -130,12 +121,12 @@ L(loop):
mulld r6, r29, r3
addc r6, r6, r31
adde r8, r8, r29
- cmpd cr7, r27, r25
mulld r0, r30, r8
+ subf r31, r0, r31
mulhdu r11, r28, r8
mulld r10, r28, r8
- subf r31, r0, r31
li r7, 0
+ cmpd cr7, r27, r25
blt cr7, L(60)
ld r7, 0(r26)
addi r26, r26, -8
diff --git a/gmp/mpn/powerpc64/mode64/gcd_1.asm b/gmp/mpn/powerpc64/mode64/gcd_1.asm
deleted file mode 100644
index 8762bbbef5..0000000000
--- a/gmp/mpn/powerpc64/mode64/gcd_1.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-dnl PowerPC-64 mpn_gcd_1.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C POWER3/PPC630 ?
-C POWER4/PPC970 8.5
-C POWER5 ?
-C POWER6 10.1
-C POWER7 9.4
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up', `r3')
-define(`n', `r4')
-define(`v0', `r5')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
- mflr r0
- std r30, -16(r1)
- std r31, -8(r1)
- std r0, 16(r1)
- stdu r1, -128(r1)
-
- ld r7, 0(up) C U low limb
- or r0, r5, r7 C x | y
-
- neg r6, r0
- and r6, r6, r0
- cntlzd r31, r6 C common twos
- subfic r31, r31, 63
-
- neg r6, r5
- and r6, r6, r5
- cntlzd r8, r6
- subfic r8, r8, 63
- srd r5, r5, r8
- mr r30, r5 C v0 saved
-
- cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD
- blt L(bmod)
- CALL( mpn_mod_1)
- b L(reduced)
-L(bmod):
- li r6, 0
- CALL( mpn_modexact_1c_odd)
-L(reduced):
-
-define(`mask', `r0')dnl
-define(`a1', `r4')dnl
-define(`a2', `r5')dnl
-define(`d1', `r6')dnl
-define(`d2', `r7')dnl
-define(`cnt', `r9')dnl
-
- neg. r6, r3
- and r6, r6, r3
- cntlzd cnt, r6
- subfic cnt, cnt, 63
- li r12, 63
- bne L(mid)
- b L(end)
-
- ALIGN(16)
-L(top):
- and a1, r10, mask C d - a
- andc a2, r11, mask C a - d
- and d1, r3, mask C a
- andc d2, r30, mask C d
- or r3, a1, a2 C new a
- subf cnt, cnt, r12
- or r30, d1, d2 C new d
-L(mid): srd r3, r3, cnt
- sub. r10, r30, r3 C r10 = d - a
- subc r11, r3, r30 C r11 = a - d
- neg r8, r10
- and r8, r8, r10
- subfe mask, mask, mask
- cntlzd cnt, r8
- bne L(top)
-
-L(end): sld r3, r30, r31
-
- addi r1, r1, 128
- ld r0, 16(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- mtlr r0
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/gmp-mparam.h
index f8305f4720..4eb8887724 100644
--- a/gmp/mpn/powerpc64/mode64/gmp-mparam.h
+++ b/gmp/mpn/powerpc64/mode64/gmp-mparam.h
@@ -5,73 +5,62 @@ Copyright 2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
/* 1600MHz PPC970 */
/* Generated by tuneup.c, 2009-01-14, gcc 4.0 */
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 93
-#define MUL_TOOM44_THRESHOLD 135
+#define MUL_KARATSUBA_THRESHOLD 14
+#define MUL_TOOM3_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 155
-#define SQR_BASECASE_THRESHOLD 6
-#define SQR_TOOM2_THRESHOLD 32
-#define SQR_TOOM3_THRESHOLD 74
-#define SQR_TOOM4_THRESHOLD 136
+#define SQR_BASECASE_THRESHOLD 5
+#define SQR_KARATSUBA_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 154
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 44
-#define MULLO_MUL_N_THRESHOLD 234
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 40
+#define MULLOW_MUL_N_THRESHOLD 234
#define DIV_SB_PREINV_THRESHOLD 0 /* always */
-#define DIV_DC_THRESHOLD 33
-#define POWM_THRESHOLD 89
+#define DIV_DC_THRESHOLD 32
+#define POWM_THRESHOLD 93
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 93
-#define GCD_DC_THRESHOLD 237
-#define GCDEXT_DC_THRESHOLD 273
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 96
+#define GCD_DC_THRESHOLD 242
+#define GCDEXT_DC_THRESHOLD 353
#define JACOBI_BASE_METHOD 1
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1_THRESHOLD 6
+#define MOD_1_1_THRESHOLD 7
#define MOD_1_2_THRESHOLD 9
-#define MOD_1_4_THRESHOLD 23
+#define MOD_1_4_THRESHOLD 44
#define USE_PREINV_DIVREM_1 0
-#define USE_PREINV_MOD_1 0
+#define USE_PREINV_MOD_1 1
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 650
-#define SET_STR_PRECOMPUTE_THRESHOLD 1713
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 532
+#define SET_STR_PRECOMPUTE_THRESHOLD 1790
#define MUL_FFT_TABLE { 336, 672, 1856, 2816, 7168, 20480, 81920, 327680, 0 }
#define MUL_FFT_MODF_THRESHOLD 304
@@ -80,3 +69,9 @@ see https://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE { 272, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 }
#define SQR_FFT_MODF_THRESHOLD 272
#define SQR_FFT_THRESHOLD 2688
+
+/* These tables are now obsolete */
+
+#define MUL_FFT_TABLE2 {{1,4}, {209,5}, {513,6}, {1217,7}, {2561,8}, {3329,7}, {3457,8}, {3841,7}, {4097,8}, {6913,9}, {7681,8}, {8961,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {22017,10}, {23553,9}, {26113,11}, {30721,10}, {48129,9}, {50689,10}, {56321,11}, {61441,10}, {81409,11}, {96257,10}, {97281,9}, {98305,10}, {99329,9}, {101889,10}, {106241,12}, {126977,11}, {129025,10}, {146945,11}, {161793,10}, {179969,11}, {194561,10}, {212737,11}, {227329,10}, {228865,12}, {258049,11}, {359937,12}, {389121,11}, {458241,13}, {516097,12}, {520193,11}, {588801,12}, {651265,11}, {720385,12}, {782337,11}, {851457,12}, {913409,11}, {982529,12}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {209,5}, {481,6}, {1089,7}, {3073,8}, {6913,9}, {7681,8}, {8449,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {26113,11}, {30721,10}, {31745,9}, {34305,10}, {56321,11}, {63489,10}, {81409,11}, {96257,9}, {96769,10}, {98049,12}, {126977,11}, {129025,10}, {146945,11}, {161793,10}, {212481,12}, {258049,11}, {267265,10}, {270337,11}, {272385,10}, {274433,11}, {424961,13}, {516097,12}, {520193,11}, {MP_SIZE_T_MAX,0}}
diff --git a/gmp/mpn/powerpc64/mode64/invert_limb.asm b/gmp/mpn/powerpc64/mode64/invert_limb.asm
index dfdba6451e..02a67a3979 100644
--- a/gmp/mpn/powerpc64/mode64/invert_limb.asm
+++ b/gmp/mpn/powerpc64/mode64/invert_limb.asm
@@ -1,88 +1,109 @@
dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
-dnl Copyright 2004-2006, 2008, 2010, 2013 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2006, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb (approximate)
-C POWER3/PPC630 80
-C POWER4/PPC970 86
-C POWER5 86
-C POWER6 170
-C POWER7 66
+C cycles/limb
+C POWER3/PPC630: ?
+C POWER4/PPC970: 75 (including call+ret)
+
+C TODO:
+C * Pair multiply instructions.
ASM_START()
-PROLOGUE(mpn_invert_limb,toc)
+PROLOGUE(mpn_invert_limb)
LEAL( r12, approx_tab)
- srdi r9, r3, 32
- rlwinm r9, r9, 10, 23, 30 C (d >> 55) & 0x1fe
- srdi r10, r3, 24 C d >> 24
- lis r11, 0x1000
- rldicl r8, r3, 0, 63 C d mod 2
- addi r10, r10, 1 C d40
- sldi r11, r11, 32 C 2^60
- srdi r7, r3, 1 C d/2
- add r7, r7, r8 C d63 = ceil(d/2)
- neg r8, r8 C mask = -(d mod 2)
- lhzx r0, r9, r12
- mullw r9, r0, r0 C v0*v0
- sldi r6, r0, 11 C v0 << 11
- addi r0, r6, -1 C (v0 << 11) - 1
- mulld r9, r9, r10 C v0*v0*d40
- srdi r9, r9, 40 C v0*v0*d40 >> 40
- subf r9, r9, r0 C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
- mulld r0, r9, r10 C v1*d40
- sldi r6, r9, 13 C v1 << 13
- subf r0, r0, r11 C 2^60 - v1*d40
- mulld r0, r0, r9 C v1 * (2^60 - v1*d40)
- srdi r0, r0, 47 C v1 * (2^60 - v1*d40) >> 47
- add r0, r0, r6 C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
- mulld r11, r0, r7 C v2 * d63
- srdi r10, r0, 1 C v2 >> 1
- sldi r9, r0, 31 C v2 << 31
- and r8, r10, r8 C (v2 >> 1) & mask
- subf r8, r11, r8 C ((v2 >> 1) & mask) - v2 * d63
- mulhdu r0, r8, r0 C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63)
- srdi r0, r0, 1 C p1 >> 1
- add r0, r0, r9 C v3 = (v2 << 31) + (p1 >> 1)
- nop
- mulld r11, r0, r3
- mulhdu r9, r0, r3
- addc r10, r11, r3
- adde r3, r9, r3
- subf r3, r3, r0
+
+ srdi r11, r3, 32 C r11 = d >> 32
+ rlwinm r9, r11, 10, 23, 30 C r9 = ((d >> 55) & 0xff) << 1
+ lhzx r0, r12, r9 C load initial approximation
+ rldic r10, r0, 6, 42
+ mulld r8, r10, r10
+ sldi r9, r10, 17
+ mulld r0, r8, r11
+ srdi r0, r0, 31
+ subf r10, r0, r9
+ mulld r8, r10, r10
+ sldi r11, r10, 33
+ mulhdu r0, r8, r3
+ sldi r9, r0, 1
+ subf r10, r9, r11
+ sldi r11, r10, 2
+ mulhdu r0, r10, r10
+ mulld r8, r10, r10
+ mulhdu r10, r8, r3
+ mulld r9, r0, r3
+ mulhdu r0, r0, r3
+ addc r8, r9, r10
+ addze r10, r0
+ srdi r0, r8, 62
+ rldimi r0, r10, 2, 0
+ sldi r9, r8, 2
+ subfic r10, r9, 0
+ subfe r8, r0, r11
+ mulhdu r10, r3, r8
+ add r10, r10, r3
+ mulld r9, r3, r8
+ subf r11, r10, r8
+ addi r0, r10, 1
+ addi r8, r11, -1
+ and r0, r3, r0
+ addc r11, r9, r0
+ addze r10, r10
+ addc r0, r11, r3
+ addze r10, r10
+ subf r3, r10, r8
blr
EPILOGUE()
DEF_OBJECT(approx_tab)
-forloop(i,256,512-1,dnl
-` .short eval(0x7fd00/i)
-')dnl
+ .short 1023,1020,1016,1012,1008,1004,1000,996
+ .short 992,989,985,981,978,974,970,967
+ .short 963,960,956,953,949,946,942,939
+ .short 936,932,929,926,923,919,916,913
+ .short 910,907,903,900,897,894,891,888
+ .short 885,882,879,876,873,870,868,865
+ .short 862,859,856,853,851,848,845,842
+ .short 840,837,834,832,829,826,824,821
+ .short 819,816,814,811,809,806,804,801
+ .short 799,796,794,791,789,787,784,782
+ .short 780,777,775,773,771,768,766,764
+ .short 762,759,757,755,753,751,748,746
+ .short 744,742,740,738,736,734,732,730
+ .short 728,726,724,722,720,718,716,714
+ .short 712,710,708,706,704,702,700,699
+ .short 697,695,693,691,689,688,686,684
+ .short 682,680,679,677,675,673,672,670
+ .short 668,667,665,663,661,660,658,657
+ .short 655,653,652,650,648,647,645,644
+ .short 642,640,639,637,636,634,633,631
+ .short 630,628,627,625,624,622,621,619
+ .short 618,616,615,613,612,611,609,608
+ .short 606,605,604,602,601,599,598,597
+ .short 595,594,593,591,590,589,587,586
+ .short 585,583,582,581,579,578,577,576
+ .short 574,573,572,571,569,568,567,566
+ .short 564,563,562,561,560,558,557,556
+ .short 555,554,553,551,550,549,548,547
+ .short 546,544,543,542,541,540,539,538
+ .short 537,536,534,533,532,531,530,529
+ .short 528,527,526,525,524,523,522,521
+ .short 520,519,518,517,516,515,514,513
END_OBJECT(approx_tab)
ASM_END()
diff --git a/gmp/mpn/powerpc64/mode64/mod_1_1.asm b/gmp/mpn/powerpc64/mode64/mod_1_1.asm
deleted file mode 100644
index 873373054f..0000000000
--- a/gmp/mpn/powerpc64/mode64/mod_1_1.asm
+++ /dev/null
@@ -1,164 +0,0 @@
-dnl PowerPC-64 mpn_mod_1_1p
-
-dnl Copyright 2010, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 17
-C POWER5 16
-C POWER6 30
-C POWER7 10.2
-
-C TODO
-C * Optimise, in particular the cps function. This was compiler-generated and
-C then hand optimised.
-
-C INPUT PARAMETERS
-define(`ap', `r3')
-define(`n', `r4')
-define(`d', `r5')
-define(`cps', `r6')
-
-ASM_START()
-
-EXTERN_FUNC(mpn_invert_limb)
-
-PROLOGUE(mpn_mod_1_1p)
- sldi r10, r4, 3
- addi r4, r4, -1
- add r3, r3, r10
- ld r0, 16(r6) C B1modb
- ld r12, 24(r6) C B2modb
- ld r9, -8(r3)
- ld r10, -16(r3)
- mtctr r4
- mulhdu r8, r9, r0
- mulld r7, r9, r0
- addc r11, r7, r10
- addze r9, r8
- bdz L(end)
-
- ALIGN(16)
-L(top): ld r4, -24(r3)
- addi r3, r3, -8
- nop
- mulld r10, r11, r0
- mulld r8, r9, r12
- mulhdu r11, r11, r0
- mulhdu r9, r9, r12
- addc r7, r10, r4
- addze r10, r11
- addc r11, r8, r7
- adde r9, r9, r10
- bdnz L(top)
-
-L(end):
-ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
-` lwz r0, 8(r6)',
-` lwz r0, 12(r6)')
- ld r3, 0(r6)
- cmpdi cr7, r0, 0
- beq- cr7, L(4)
- subfic r10, r0, 64
- sld r9, r9, r0
- srd r10, r11, r10
- or r9, r10, r9
-L(4): subfc r10, r5, r9
- subfe r10, r10, r10
- nand r10, r10, r10
- sld r11, r11, r0
- and r10, r10, r5
- subf r9, r10, r9
- mulhdu r10, r9, r3
- mulld r3, r9, r3
- addi r9, r9, 1
- addc r8, r3, r11
- adde r3, r10, r9
- mulld r3, r3, r5
- subf r3, r3, r11
- cmpld cr7, r8, r3
- bge cr7, L(5) C FIXME: Make branch-less
- add r3, r3, r5
-L(5): cmpld cr7, r3, r5
- bge- cr7, L(10)
- srd r3, r3, r0
- blr
-
-L(10): subf r3, r5, r3
- srd r3, r3, r0
- blr
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1_1p_cps,toc)
- mflr r0
- std r29, -24(r1)
- std r30, -16(r1)
- std r31, -8(r1)
- cntlzd r31, r4
- std r0, 16(r1)
- extsw r31, r31
- mr r29, r3
- stdu r1, -144(r1)
- sld r30, r4, r31
- mr r3, r30
- CALL( mpn_invert_limb)
- cmpdi cr7, r31, 0
- neg r0, r30
- beq- cr7, L(13)
- subfic r11, r31, 64
- li r0, 1
- neg r9, r30
- srd r11, r3, r11
- sld r0, r0, r31
- or r0, r11, r0
- mulld r0, r0, r9
-L(13): mulhdu r9, r0, r3
- mulld r11, r0, r3
- add r9, r0, r9
- nor r9, r9, r9
- mulld r9, r9, r30
- cmpld cr7, r11, r9
- bge cr7, L(14)
- add r9, r9, r30
-L(14): addi r1, r1, 144
- srd r0, r0, r31
- std r31, 8(r29)
- std r3, 0(r29)
- std r0, 16(r29)
- ld r0, 16(r1)
- srd r9, r9, r31
- ld r30, -16(r1)
- ld r31, -8(r1)
- std r9, 24(r29)
- ld r29, -24(r1)
- mtlr r0
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/mod_1_4.asm b/gmp/mpn/powerpc64/mode64/mod_1_4.asm
deleted file mode 100644
index 0b7d6bf699..0000000000
--- a/gmp/mpn/powerpc64/mode64/mod_1_4.asm
+++ /dev/null
@@ -1,270 +0,0 @@
-dnl PowerPC-64 mpn_mod_1s_4p
-
-dnl Copyright 2010, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 9
-C POWER5 9
-C POWER6 13
-C POWER7 3.5
-
-C TODO
-C * Optimise, in particular the cps function. This was compiler-generated and
-C then hand optimised.
-
-C INPUT PARAMETERS
-define(`ap', `r3')
-define(`n', `r4')
-define(`d', `r5')
-define(`cps', `r6')
-
-ASM_START()
-
-EXTERN_FUNC(mpn_invert_limb)
-
-PROLOGUE(mpn_mod_1s_4p)
- std r23, -72(r1)
- ld r23, 48(cps)
- std r24, -64(r1)
- std r25, -56(r1)
- ld r24, 32(cps)
- ld r25, 24(cps)
- std r26, -48(r1)
- std r27, -40(r1)
- ld r26, 16(cps)
- std r28, -32(r1)
- std r29, -24(r1)
- std r30, -16(r1)
- std r31, -8(r1)
- ld r30, 40(cps)
-
- rldicl. r0, n, 0,62
- sldi r31, n, 3
- add ap, ap, r31 C make ap point at end of operand
-
- cmpdi cr7, r0, 2
- beq cr0, L(b00)
- blt cr7, L(b01)
- beq cr7, L(b10)
-
-L(b11): ld r11, -16(ap)
- ld r9, -8(ap)
- ld r0, -24(ap)
- mulhdu r27, r11, r26
- mulld r8, r11, r26
- mulhdu r11, r9, r25
- mulld r9, r9, r25
- addc r31, r8, r0
- addze r10, r27
- addc r0, r9, r31
- adde r9, r11, r10
- addi ap, ap, -40
- b L(6)
-
- ALIGN(16)
-L(b00): ld r11, -24(ap)
- ld r10, -16(ap)
- ld r9, -8(ap)
- ld r0, -32(ap)
- mulld r8, r11, r26
- mulhdu r7, r10, r25
- mulhdu r27, r11, r26
- mulhdu r11, r9, r24
- mulld r10, r10, r25
- mulld r9, r9, r24
- addc r31, r8, r0
- addze r0, r27
- addc r8, r31, r10
- adde r10, r0, r7
- addc r0, r9, r8
- adde r9, r11, r10
- addi ap, ap, -48
- b L(6)
-
- ALIGN(16)
-L(b01): li r9, 0
- ld r0, -8(ap)
- addi ap, ap, -24
- b L(6)
-
- ALIGN(16)
-L(b10): ld r9, -8(ap)
- ld r0, -16(ap)
- addi ap, ap, -32
-
- ALIGN(16)
-L(6): addi r10, n, 3
- srdi r7, r10, 2
- mtctr r7
- bdz L(end)
-
- ALIGN(16)
-L(top): ld r31, -16(ap)
- ld r10, -8(ap)
- ld r11, 8(ap)
- ld r12, 0(ap)
- mulld r29, r0, r30 C rl * B4modb
- mulhdu r0, r0, r30 C rl * B4modb
- mulhdu r27, r10, r26
- mulld r10, r10, r26
- mulhdu r7, r9, r23 C rh * B5modb
- mulld r9, r9, r23 C rh * B5modb
- mulhdu r28, r11, r24
- mulld r11, r11, r24
- mulhdu r4, r12, r25
- mulld r12, r12, r25
- addc r8, r10, r31
- addze r10, r27
- addi ap, ap, -32
- addc r27, r8, r12
- adde r12, r10, r4
- addc r11, r27, r11
- adde r31, r12, r28
- addc r12, r11, r29
- adde r4, r31, r0
- addc r0, r9, r12
- adde r9, r7, r4
- bdnz L(top)
-
-L(end):
-ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
-` lwz r3, 8(cps)',
-` lwz r3, 12(cps)')
- mulld r10, r9, r26
- mulhdu r9, r9, r26
- addc r11, r0, r10
- addze r9, r9
- ld r10, 0(cps)
- subfic r8, r3, 64
- sld r9, r9, r3
- srd r8, r11, r8
- sld r11, r11, r3
- or r9, r8, r9
- mulld r0, r9, r10
- mulhdu r10, r9, r10
- addi r9, r9, 1
- addc r8, r0, r11
- adde r0, r10, r9
- mulld r0, r0, d
- subf r0, r0, r11
- cmpld cr7, r8, r0
- bge cr7, L(9)
- add r0, r0, d
-L(9): cmpld cr7, r0, d
- bge- cr7, L(16)
-L(10): srd r3, r0, r3
- ld r23, -72(r1)
- ld r24, -64(r1)
- ld r25, -56(r1)
- ld r26, -48(r1)
- ld r27, -40(r1)
- ld r28, -32(r1)
- ld r29, -24(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- blr
-
-L(16): subf r0, d, r0
- b L(10)
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1s_4p_cps,toc)
- mflr r0
- std r29, -24(r1)
- std r30, -16(r1)
- mr r29, r3
- std r0, 16(r1)
- std r31, -8(r1)
- stdu r1, -144(r1)
- cntlzd r31, r4
- sld r30, r4, r31
- mr r3, r30
- CALL( mpn_invert_limb)
- subfic r9, r31, 64
- li r10, 1
- sld r10, r10, r31
- srd r9, r3, r9
- neg r0, r30
- or r10, r10, r9
- mulld r10, r10, r0
- mulhdu r11, r10, r3
- nor r11, r11, r11
- subf r11, r10, r11
- mulld r11, r11, r30
- mulld r0, r10, r3
- cmpld cr7, r0, r11
- bge cr7, L(18)
- add r11, r11, r30
-L(18): mulhdu r9, r11, r3
- add r9, r11, r9
- nor r9, r9, r9
- mulld r9, r9, r30
- mulld r0, r11, r3
- cmpld cr7, r0, r9
- bge cr7, L(19)
- add r9, r9, r30
-L(19): mulhdu r0, r9, r3
- add r0, r9, r0
- nor r0, r0, r0
- mulld r0, r0, r30
- mulld r8, r9, r3
- cmpld cr7, r8, r0
- bge cr7, L(20)
- add r0, r0, r30
-L(20): mulhdu r8, r0, r3
- add r8, r0, r8
- nor r8, r8, r8
- mulld r8, r8, r30
- mulld r7, r0, r3
- cmpld cr7, r7, r8
- bge cr7, L(21)
- add r8, r8, r30
-L(21): srd r0, r0, r31
- addi r1, r1, 144
- srd r8, r8, r31
- srd r10, r10, r31
- srd r11, r11, r31
- std r0, 40(r29)
- std r31, 8(r29)
- srd r9, r9, r31
- ld r0, 16(r1)
- ld r30, -16(r1)
- std r8, 48(r29)
- std r3, 0(r29)
- mtlr r0
- ld r31, -8(r1)
- std r10, 16(r29)
- std r11, 24(r29)
- std r9, 32(r29)
- ld r29, -24(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm b/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm
index c35e0e37a4..ca46c3933b 100644
--- a/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm
+++ b/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -1,41 +1,28 @@
-dnl PowerPC-64 mpn_mod_34lsub1 -- modulo 2^48-1.
+dnl PowerPC-64 mpn_mod_34lsub1 -- modulo 2^48-1.
dnl Copyright 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.33
-C POWER4/PPC970 1.5
-C POWER5 1.32
-C POWER6 2.35
-C POWER7 1
+C cycles/limb
+C POWER3/PPC630: 1.33
+C POWER4/PPC970: 1.5
+C POWER5: 1.57
C INPUT PARAMETERS
define(`up',`r3')
diff --git a/gmp/mpn/powerpc64/mode64/mode1o.asm b/gmp/mpn/powerpc64/mode64/mode1o.asm
index 726339a931..95aa2870da 100644
--- a/gmp/mpn/powerpc64/mode64/mode1o.asm
+++ b/gmp/mpn/powerpc64/mode64/mode1o.asm
@@ -3,43 +3,30 @@ dnl PowerPC-64 mpn_modexact_1_odd -- mpn by limb exact remainder.
dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 13-19
-C POWER4/PPC970 16
-C POWER5 16
-C POWER6 ?
-C POWER7 12
+C cycles/limb
+C POWER3/PPC630: 13-19
+C POWER4/PPC970: 16
+C POWER5: 16
C TODO
-C * Check if n=1 code is really an improvement. It probably isn't.
-C * Make more similar to dive_1.asm.
+C * Check if n=1 code is really an improvement. It probably isn't.
+C * Make more similar to dive_1.asm.
C INPUT PARAMETERS
define(`up', `r3')
@@ -52,7 +39,7 @@ ASM_START()
EXTERN(binvert_limb_table)
-PROLOGUE(mpn_modexact_1c_odd,toc)
+PROLOGUE(mpn_modexact_1c_odd)
addic. n, n, -1 C set carry as side effect
ld r8, 0(up)
bne cr0, L(2)
diff --git a/gmp/mpn/powerpc64/mode64/mul_1.asm b/gmp/mpn/powerpc64/mode64/mul_1.asm
index 27a8f8fb4d..8f644d8710 100644
--- a/gmp/mpn/powerpc64/mode64/mul_1.asm
+++ b/gmp/mpn/powerpc64/mode64/mul_1.asm
@@ -1,42 +1,30 @@
dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl the result in a second limb vector.
-dnl Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 7.25? not updated for last file revision
-C POWER5 7.25
-C POWER6 14
-C POWER7 2.9
+C cycles/limb
+C POWER3/PPC630: 6-18
+C POWER4/PPC970: 7.25
+C POWER5: 7.75
C TODO
C * Try to reduce the number of needed live registers (at least r5 and r10
@@ -130,18 +118,26 @@ L(b10): ld r27, 8(up)
L(top): mulld r0, r26, r6
mulhdu r5, r26, r6
+ ld r26, 0(up)
+ nop
+
mulld r7, r27, r6
mulhdu r8, r27, r6
- ld r26, 0(up)
ld r27, 8(up)
+ nop
+
adde r0, r0, r12
adde r7, r7, r5
+
mulld r9, r26, r6
mulhdu r10, r26, r6
+ ld r26, 16(up)
+ nop
+
mulld r11, r27, r6
mulhdu r12, r27, r6
- ld r26, 16(up)
ld r27, 24(up)
+
std r0, 0(rp)
adde r9, r9, r8
std r7, 8(rp)
@@ -155,10 +151,13 @@ L(top): mulld r0, r26, r6
L(end): mulld r0, r26, r6
mulhdu r5, r26, r6
+
mulld r7, r27, r6
mulhdu r8, r27, r6
+
adde r0, r0, r12
adde r7, r7, r5
+
std r0, 0(rp)
std r7, 8(rp)
L(ret): addze r3, r8
diff --git a/gmp/mpn/powerpc64/mode64/mul_basecase.asm b/gmp/mpn/powerpc64/mode64/mul_basecase.asm
index 18731879e4..cea5417eb2 100644
--- a/gmp/mpn/powerpc64/mode64/mul_basecase.asm
+++ b/gmp/mpn/powerpc64/mode64/mul_basecase.asm
@@ -1,40 +1,30 @@
-dnl PowerPC-64 mpn_mul_basecase.
+dnl PowerPC-64 mpn_mul_basecase.
-dnl Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 8
-C POWER5 8
-C POWER6 24
+C cycles/limb
+C POWER3/PPC630: 6-18
+C POWER4/PPC970: 8
+C POWER5: 8
+
C INPUT PARAMETERS
define(`rp', `r3')
diff --git a/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h
deleted file mode 100644
index 61a437b6e6..0000000000
--- a/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* POWER3/PowerPC630 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 2008-2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
-#define USE_PREINV_DIVREM_1 0
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 10
-#define MUL_TOOM33_THRESHOLD 33
-#define MUL_TOOM44_THRESHOLD 46
-#define MUL_TOOM6H_THRESHOLD 77
-#define MUL_TOOM8H_THRESHOLD 139
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 47
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 14
-#define SQR_TOOM3_THRESHOLD 45
-#define SQR_TOOM4_THRESHOLD 64
-#define SQR_TOOM6_THRESHOLD 85
-#define SQR_TOOM8_THRESHOLD 139
-
-#define MULMID_TOOM42_THRESHOLD 22
-
-#define MULMOD_BNM1_THRESHOLD 8
-#define SQRMOD_BNM1_THRESHOLD 10
-
-#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 220, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \
- { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \
- { 7, 7}, { 15, 8}, { 13, 9}, { 7, 8}, \
- { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
- { 15, 8}, { 33, 9}, { 23,10}, { 15, 9}, \
- { 35, 8}, { 71,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
- { 79,10}, { 55,11}, { 31,10}, { 63, 9}, \
- { 127,10}, { 71, 9}, { 143, 8}, { 287,10}, \
- { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \
- { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \
- { 351,11}, { 95,10}, { 191, 9}, { 383,11}, \
- { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 175,10}, { 351,12}, { 95,11}, { 191,10}, \
- { 383, 9}, { 767,11}, { 223,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 287,10}, \
- { 575, 9}, { 1151,12}, { 159,11}, { 319,10}, \
- { 639,11}, { 351,12}, { 191,11}, { 383,10}, \
- { 767,12}, { 223,11}, { 447,10}, { 895,13}, \
- { 127,12}, { 255,11}, { 511,12}, { 287,11}, \
- { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \
- { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \
- { 447,11}, { 895,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 120
-#define MUL_FFT_THRESHOLD 2688
-
-#define SQR_FFT_MODF_THRESHOLD 188 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 188, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \
- { 13, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \
- { 9, 7}, { 19, 8}, { 13, 9}, { 7, 8}, \
- { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
- { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \
- { 23,10}, { 15, 9}, { 39,10}, { 23,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79, 8}, { 159,10}, { 47, 9}, { 95, 8}, \
- { 191,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \
- { 79, 9}, { 159,11}, { 47,10}, { 95, 9}, \
- { 191,12}, { 31,11}, { 63,10}, { 127, 9}, \
- { 255, 8}, { 511,10}, { 143, 9}, { 287,11}, \
- { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \
- { 175,11}, { 95,10}, { 191, 9}, { 383,11}, \
- { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 175,12}, { 95,11}, { 191,10}, { 383, 9}, \
- { 767,11}, { 223,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 287,10}, { 575,12}, \
- { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
- { 191,11}, { 383,10}, { 767,12}, { 223,11}, \
- { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
- { 511,12}, { 287,11}, { 575,10}, { 1151,12}, \
- { 319,11}, { 639,12}, { 351,13}, { 191,12}, \
- { 383,11}, { 767,12}, { 447,11}, { 895,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 118
-#define SQR_FFT_THRESHOLD 1728
-
-#define MULLO_BASECASE_THRESHOLD 2
-#define MULLO_DC_THRESHOLD 27
-#define MULLO_MUL_N_THRESHOLD 2511
-
-#define DC_DIV_QR_THRESHOLD 23
-#define DC_DIVAPPR_Q_THRESHOLD 87
-#define DC_BDIV_QR_THRESHOLD 27
-#define DC_BDIV_Q_THRESHOLD 60
-
-#define INV_MULMOD_BNM1_THRESHOLD 27
-#define INV_NEWTON_THRESHOLD 91
-#define INV_APPR_THRESHOLD 91
-
-#define BINV_NEWTON_THRESHOLD 115
-#define REDC_1_TO_REDC_N_THRESHOLD 31
-
-#define MU_DIV_QR_THRESHOLD 551
-#define MU_DIVAPPR_Q_THRESHOLD 551
-#define MUPI_DIV_QR_THRESHOLD 42
-#define MU_BDIV_QR_THRESHOLD 483
-#define MU_BDIV_Q_THRESHOLD 492
-
-#define POWM_SEC_TABLE 2,23,140,556,713,746
-
-#define MATRIX22_STRASSEN_THRESHOLD 8
-#define HGCD_THRESHOLD 56
-#define HGCD_APPR_THRESHOLD 51
-#define HGCD_REDUCE_THRESHOLD 688
-#define GCD_DC_THRESHOLD 333
-#define GCDEXT_DC_THRESHOLD 126
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 17
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 375
-#define SET_STR_PRECOMPUTE_THRESHOLD 812
-
-#define FAC_DSC_THRESHOLD 351
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h
deleted file mode 100644
index d909b292bb..0000000000
--- a/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/* POWER4/PowerPC970 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 2008-2010, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 1800 MHz PPC970 */
-/* FFT tuning limit = 10000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.0 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
-#define USE_PREINV_DIVREM_1 0
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 34
-
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 53
-#define MUL_TOOM44_THRESHOLD 136
-#define MUL_TOOM6H_THRESHOLD 197
-#define MUL_TOOM8H_THRESHOLD 296
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 79
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 24
-#define SQR_TOOM3_THRESHOLD 85
-#define SQR_TOOM4_THRESHOLD 142
-#define SQR_TOOM6_THRESHOLD 270
-#define SQR_TOOM8_THRESHOLD 430
-
-#define MULMID_TOOM42_THRESHOLD 32
-
-#define MULMOD_BNM1_THRESHOLD 11
-#define SQRMOD_BNM1_THRESHOLD 15
-
-#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 380, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \
- { 9, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 13, 5}, { 28, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \
- { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
- { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 63, 9}, { 127,10}, { 87,11}, \
- { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 135, 9}, { 271,11}, { 79,10}, \
- { 159, 9}, { 319,10}, { 167, 9}, { 335,11}, \
- { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \
- { 207, 9}, { 415,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \
- { 303, 9}, { 607,10}, { 319, 9}, { 639,10}, \
- { 335, 9}, { 671,10}, { 351,12}, { 95,11}, \
- { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
- { 415, 9}, { 831,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
- { 287,10}, { 575,11}, { 303,10}, { 607,11}, \
- { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,10}, { 831,12}, { 223,10}, \
- { 895,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
- { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
- { 383,11}, { 767,12}, { 415,11}, { 895,12}, \
- { 479,14}, { 127,13}, { 255,12}, { 607,13}, \
- { 319,12}, { 703,13}, { 383,12}, { 895,14}, \
- { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
- { 1151,13}, { 703,14}, { 383,13}, { 895,15}, \
- { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
- { 1087,12}, { 2175,13}, { 1151,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 165
-#define MUL_FFT_THRESHOLD 9088
-
-#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 13, 5}, { 28, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 14, 6}, { 29, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95, 9}, \
- { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511,10}, { 135, 9}, \
- { 271, 8}, { 543,11}, { 79,10}, { 159, 9}, \
- { 319, 8}, { 639,10}, { 175, 9}, { 351,11}, \
- { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \
- { 207, 9}, { 415,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
- { 351,12}, { 95,11}, { 191,10}, { 383, 9}, \
- { 767,11}, { 207,10}, { 415, 9}, { 831,11}, \
- { 223,13}, { 63,12}, { 127,11}, { 255,10}, \
- { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
- { 575,11}, { 303,10}, { 607,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,12}, { 223,10}, { 895,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
- { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
- { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
- { 415,11}, { 895,12}, { 479,14}, { 127,13}, \
- { 255,12}, { 607,13}, { 319,12}, { 703,13}, \
- { 383,12}, { 927,14}, { 255,13}, { 511,12}, \
- { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \
- { 1279,13}, { 703,14}, { 383,13}, { 895,12}, \
- { 1791,15}, { 255,14}, { 511,13}, { 1023,12}, \
- { 2047,13}, { 1087,12}, { 2175,13}, { 1151,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 162
-#define SQR_FFT_THRESHOLD 6272
-
-#define MULLO_BASECASE_THRESHOLD 5
-#define MULLO_DC_THRESHOLD 44
-#define MULLO_MUL_N_THRESHOLD 18087
-
-#define DC_DIV_QR_THRESHOLD 42
-#define DC_DIVAPPR_Q_THRESHOLD 167
-#define DC_BDIV_QR_THRESHOLD 46
-#define DC_BDIV_Q_THRESHOLD 110
-
-#define INV_MULMOD_BNM1_THRESHOLD 30
-#define INV_NEWTON_THRESHOLD 181
-#define INV_APPR_THRESHOLD 173
-
-#define BINV_NEWTON_THRESHOLD 214
-#define REDC_1_TO_REDC_N_THRESHOLD 56
-
-#define MU_DIV_QR_THRESHOLD 998
-#define MU_DIVAPPR_Q_THRESHOLD 1017
-#define MUPI_DIV_QR_THRESHOLD 92
-#define MU_BDIV_QR_THRESHOLD 889
-#define MU_BDIV_Q_THRESHOLD 1017
-
-#define POWM_SEC_TABLE 2,22,87,579,1925
-
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 109
-#define HGCD_APPR_THRESHOLD 115
-#define HGCD_REDUCE_THRESHOLD 4633
-#define GCD_DC_THRESHOLD 318
-#define GCDEXT_DC_THRESHOLD 242
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 11
-#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 802
-#define SET_STR_PRECOMPUTE_THRESHOLD 1712
-
-#define FAC_DSC_THRESHOLD 507
-#define FAC_ODD_THRESHOLD 25
diff --git a/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h
deleted file mode 100644
index 15b009c357..0000000000
--- a/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
-Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* POWER5 (friggms.hpc.ntnu.no) */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
-#define USE_PREINV_DIVREM_1 0
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 40
-
-#define MUL_TOOM22_THRESHOLD 21
-#define MUL_TOOM33_THRESHOLD 24
-#define MUL_TOOM44_THRESHOLD 70
-#define MUL_TOOM6H_THRESHOLD 262
-#define MUL_TOOM8H_THRESHOLD 393
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 24
-#define SQR_TOOM3_THRESHOLD 81
-#define SQR_TOOM4_THRESHOLD 142
-#define SQR_TOOM6_THRESHOLD 189
-#define SQR_TOOM8_THRESHOLD 284
-
-#define MULMID_TOOM42_THRESHOLD 36
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 15
-
-#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \
- { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
- { 143,10}, { 287, 9}, { 575,10}, { 319,12}, \
- { 95,11}, { 191,10}, { 383,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
- { 543,11}, { 287,10}, { 575, 9}, { 1151,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
- { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,12}, { 319,11}, \
- { 639,12}, { 351,11}, { 703,13}, { 191,12}, \
- { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
- { 447,11}, { 895,14}, { 127,13}, { 255,12}, \
- { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
- { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \
- { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
- { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
- { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
- { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
- { 2431,10}, { 4863,13}, { 639,12}, { 1343,13}, \
- { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
- { 1535,13}, { 831,12}, { 1663,13}, { 959,12}, \
- { 1919,11}, { 3839,15}, { 255,14}, { 511,13}, \
- { 1087,12}, { 2175,13}, { 1215,12}, { 2431,11}, \
- { 4863,14}, { 639,13}, { 1343,12}, { 2687,13}, \
- { 1407,12}, { 2815,13}, { 1471,12}, { 2943,14}, \
- { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \
- { 895,13}, { 1919,12}, { 3839,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \
- { 1663,13}, { 3327,14}, { 1919,13}, { 3839,16}, \
- { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
- { 1279,14}, { 2943,12}, { 11775,15}, { 1535,14}, \
- { 3327,15}, { 1791,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 208
-#define MUL_FFT_THRESHOLD 4224
-
-#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 272, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 19, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \
- { 31,10}, { 71, 9}, { 143,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
- { 143,11}, { 79,10}, { 159, 9}, { 319,10}, \
- { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
- { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 319,11}, { 175,10}, { 351,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543,11}, { 287,10}, { 575,12}, \
- { 159,11}, { 319,10}, { 639,11}, { 351,10}, \
- { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,12}, { 223,11}, { 447,10}, { 895,11}, \
- { 479,10}, { 959,12}, { 255,11}, { 511,10}, \
- { 1023,11}, { 543,12}, { 287,11}, { 575,12}, \
- { 319,11}, { 639,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,12}, { 447,11}, { 895,12}, { 479,11}, \
- { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \
- { 543,11}, { 1087,12}, { 575,13}, { 319,12}, \
- { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \
- { 959,12}, { 1919,15}, { 255,14}, { 511,13}, \
- { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \
- { 1215,14}, { 639,13}, { 1407,12}, { 2815,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2431,12}, { 4863,14}, { 1407,13}, { 2815,15}, \
- { 767,14}, { 1663,13}, { 3327,14}, { 1919,13}, \
- { 3839,16}, { 511,15}, { 1023,14}, { 2431,13}, \
- { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \
- { 11775,15}, { 1535,14}, { 3327,15}, { 1791,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 190
-#define SQR_FFT_THRESHOLD 3264
-
-#define MULLO_BASECASE_THRESHOLD 6
-#define MULLO_DC_THRESHOLD 60
-#define MULLO_MUL_N_THRESHOLD 7463
-
-#define DC_DIV_QR_THRESHOLD 58
-#define DC_DIVAPPR_Q_THRESHOLD 232
-#define DC_BDIV_QR_THRESHOLD 78
-#define DC_BDIV_Q_THRESHOLD 238
-
-#define INV_MULMOD_BNM1_THRESHOLD 92
-#define INV_NEWTON_THRESHOLD 155
-#define INV_APPR_THRESHOLD 157
-
-#define BINV_NEWTON_THRESHOLD 155
-#define REDC_1_TO_REDC_N_THRESHOLD 61
-
-#define MU_DIV_QR_THRESHOLD 998
-#define MU_DIVAPPR_Q_THRESHOLD 979
-#define MUPI_DIV_QR_THRESHOLD 79
-#define MU_BDIV_QR_THRESHOLD 823
-#define MU_BDIV_Q_THRESHOLD 942
-
-#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 74
-#define HGCD_APPR_THRESHOLD 155
-#define HGCD_REDUCE_THRESHOLD 2479
-#define GCD_DC_THRESHOLD 351
-#define GCDEXT_DC_THRESHOLD 288
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 650
-#define SET_STR_PRECOMPUTE_THRESHOLD 1585
-
-#define FAC_DSC_THRESHOLD 662
-#define FAC_ODD_THRESHOLD 28
diff --git a/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm
deleted file mode 100644
index 5a85f84f4a..0000000000
--- a/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm
+++ /dev/null
@@ -1,183 +0,0 @@
-dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
-
-dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C mpn_addmul_1 mpn_submul_1
-C cycles/limb cycles/limb
-C POWER3/PPC630 ? ?
-C POWER4/PPC970 ? ?
-C POWER5 ? ?
-C POWER6 12.25 12.8
-C POWER7 ? ?
-
-C TODO
-C * Reduce register usage.
-C * Schedule function entry code.
-C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
-C would bring us to 9 c/l.
-C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
-
-C INPUT PARAMETERS
-C rp limbs that are loaded, combined with the products and stored back
-C up source limbs that get multiplied
-C n limb count; r5 is reused below as a product register
-C v0 the single multiplier limb
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`v0', `r6')
-
-C addmul_1 build. ADDSUB (addc) starts the carry chain against the rp limbs
-C and ADDSUBC (adde) continues it. AM(x) emits x and SM(x) emits nothing.
-C CLRRSC(reg) expands to addic reg, r0, 0: reg gets the value of r0 and CA is
-C left clear; at its single use site r0 holds n & 3 == 0.
-C func_nc is defined but not referenced by the code in this file.
-ifdef(`OPERATION_addmul_1',`
- define(ADDSUBC, adde)
- define(ADDSUB, addc)
- define(func, mpn_addmul_1)
- define(func_nc, mpn_addmul_1c) C FIXME: not really supported
- define(AM, `$1')
- define(SM, `')
- define(CLRRSC, `addic $1, r0, 0')
-')
-C submul_1 build. ADDSUB (subfc) and ADDSUBC (subfe) compute rp limb minus
-C product limb. SM(x) emits x and AM(x) emits nothing. CLRRSC(reg) expands
-C to subfc reg, r0, r0: reg becomes 0 and CA is set, which means no borrow.
-ifdef(`OPERATION_submul_1',`
- define(ADDSUBC, subfe)
- define(ADDSUB, subfc)
- define(func, mpn_submul_1)
- define(func_nc, mpn_submul_1c) C FIXME: not really supported
- define(AM, `')
- define(SM, `$1')
- define(CLRRSC, `subfc $1, r0, r0')
-')
-
-C func is mpn_addmul_1 or mpn_submul_1, depending on the OPERATION_* define.
-C The n limbs at up are multiplied by v0 and the products are added to
-C (addmul) or subtracted from (submul) the n limbs at rp, in place.
-C r3 returns the last high product limb plus the pending carry (addmul) or
-C borrow (submul).
-C The loop handles 4 limbs per pass; n mod 4 picks the entry point, and the
-C b1/b2/b3 entries jump into the tail of the loop body.
-C NOTE(review): ctr is loaded with (n + 3) >> 2, so n = 0 would load 0 --
-C presumably callers always pass n >= 1; confirm.
-ASM_START()
-PROLOGUE(func)
- std r31, -8(r1) C save r27-r31 below r1; r1 itself is not moved
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2 C cr6 orders n & 3 against 2
- addi n, n, 3 C compute count...
- srdi n, n, 2 C ...for ctr
- mtctr n C copy loop count into ctr
- beq cr0, L(b0) C n & 3 == 0
- blt cr6, L(b1) C n & 3 == 1
- beq cr6, L(b2) C n & 3 == 2, otherwise fall into b3
-
-C n & 3 == 3: handle three limbs, then join the loop tail at L(l3).
-L(b3): ld r8, 0(up)
- ld r7, 8(up)
- ld r27, 16(up)
- addi up, up, 16 C pointers now address the third limb
- addi rp, rp, 16
- mulld r5, r8, v0 C low halves in r5, r9, r11
- mulhdu r8, r8, v0 C high halves in r8, r7, r27
- mulld r9, r7, v0
- mulhdu r7, r7, v0
- mulld r11, r27, v0
- mulhdu r27, r27, v0
- ld r29, -16(rp)
- ld r30, -8(rp)
- ld r31, 0(rp)
- addc r9, r9, r8 C add each high half into the next low half
- adde r11, r11, r7
- addze r12, r27 C r12 = top high half plus carry
- ADDSUB r5, r5, r29 C start the chain against the rp limbs
- b L(l3)
-
-C n & 3 == 2: handle two limbs, then join the loop tail at L(l2).
-L(b2): ld r7, 0(up)
- ld r27, 8(up)
- addi up, up, 8
- addi rp, rp, 8
- mulld r9, r7, v0
- mulhdu r7, r7, v0
- mulld r11, r27, v0
- mulhdu r27, r27, v0
- ld r30, -8(rp)
- ld r31, 0(rp)
- addc r11, r11, r7
- addze r12, r27 C r12 = top high half plus carry
- ADDSUB r9, r9, r30
- b L(l2)
-
-C n & 3 == 1: handle one limb, then join the loop tail at L(l1).
-L(b1): ld r27, 0(up)
- ld r31, 0(rp)
- mulld r11, r27, v0
- mulhdu r12, r27, v0 C r12 = high half, carried into the next pass
- ADDSUB r11, r11, r31
- b L(l1)
-
-C n & 3 == 0: bias both pointers by one limb so that the loop offsets
-C 8..32 address limbs 0..3, then fall into the loop.
-L(b0): addi up, up, -8
- addi rp, rp, -8
- CLRRSC( r12) C clear r12 and clr/set cy
-
- ALIGN(32)
-L(top):
-C submul only: CA arrives here as 1 for no borrow. subfe gives r11 = CA - 1
-C whatever r0 holds, and adding 1 leaves CA inverted, so the adde chain
-C below adds the borrow into the products.
-SM(` subfe r11, r0, r0') C complement...
-SM(` addic r11, r11, 1') C ...carry flag
- ld r10, 8(up)
- ld r8, 16(up)
- ld r7, 24(up)
- ld r27, 32(up)
- addi up, up, 32
- addi rp, rp, 32
- mulld r0, r10, v0 C low halves in r0, r5, r9, r11
- mulhdu r10, r10, v0 C high halves in r10, r8, r7, r27
- mulld r5, r8, v0
- mulhdu r8, r8, v0
- mulld r9, r7, v0
- mulhdu r7, r7, v0
- mulld r11, r27, v0
- mulhdu r27, r27, v0
- ld r28, -24(rp)
- adde r0, r0, r12 C add the previous high limb and pending CA
- ld r29, -16(rp)
- adde r5, r5, r10
- ld r30, -8(rp)
- ld r31, 0(rp)
- adde r9, r9, r8
- adde r11, r11, r7
- addze r12, r27 C r12 = high limb for the next pass
- ADDSUB r0, r0, r28 C second chain: combine with the rp limbs
- std r0, -24(rp)
- ADDSUBC r5, r5, r29
-L(l3): std r5, -16(rp)
- ADDSUBC r9, r9, r30
-L(l2): std r9, -8(rp)
- ADDSUBC r11, r11, r31
-L(l1): std r11, 0(rp)
- bdnz L(top)
-
-AM(` addze r3, r12') C return r12 + CA
-SM(` subfe r11, r0, r0') C complement...
- ld r31, -8(r1) C restore r27-r31
-SM(` subf r3, r11, r12') C return r12 - (CA - 1), i.e. r12 + borrow
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h
deleted file mode 100644
index c7e2f894ad..0000000000
--- a/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2003, 2009-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 3500 MHz POWER6 (kolga.bibsys.no) */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6
-#define USE_PREINV_DIVREM_1 0
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 21
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 50
-#define MUL_TOOM44_THRESHOLD 106
-#define MUL_TOOM6H_THRESHOLD 274
-#define MUL_TOOM8H_THRESHOLD 339
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 24
-#define SQR_TOOM3_THRESHOLD 49
-#define SQR_TOOM4_THRESHOLD 130
-#define SQR_TOOM6_THRESHOLD 226
-#define SQR_TOOM8_THRESHOLD 272
-
-#define MULMID_TOOM42_THRESHOLD 36
-
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 14
-
-#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
- { 33, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \
- { 31,10}, { 71,11}, { 47,12}, { 31,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
- { 135, 9}, { 271,11}, { 79, 9}, { 319, 8}, \
- { 639,10}, { 175,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207,12}, { 63,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
- { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \
- { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
- { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 79
-#define MUL_FFT_THRESHOLD 3520
-
-#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 280, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
- { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
- { 63,10}, { 47,11}, { 31,10}, { 71, 9}, \
- { 143,11}, { 47,12}, { 31,11}, { 63, 9}, \
- { 255, 8}, { 511, 9}, { 271,10}, { 143,11}, \
- { 79,10}, { 159, 9}, { 319,10}, { 175, 9}, \
- { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511, 8}, { 1023,10}, { 271, 9}, { 543,11}, \
- { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \
- { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
- { 415,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 80
-#define SQR_FFT_THRESHOLD 2752
-
-#define MULLO_BASECASE_THRESHOLD 5
-#define MULLO_DC_THRESHOLD 62
-#define MULLO_MUL_N_THRESHOLD 2995
-
-#define DC_DIV_QR_THRESHOLD 59
-#define DC_DIVAPPR_Q_THRESHOLD 200
-#define DC_BDIV_QR_THRESHOLD 70
-#define DC_BDIV_Q_THRESHOLD 168
-
-#define INV_MULMOD_BNM1_THRESHOLD 53
-#define INV_NEWTON_THRESHOLD 170
-#define INV_APPR_THRESHOLD 166
-
-#define BINV_NEWTON_THRESHOLD 220
-#define REDC_1_TO_REDC_N_THRESHOLD 67
-
-#define MU_DIV_QR_THRESHOLD 998
-#define MU_DIVAPPR_Q_THRESHOLD 942
-#define MUPI_DIV_QR_THRESHOLD 57
-#define MU_BDIV_QR_THRESHOLD 889
-#define MU_BDIV_Q_THRESHOLD 1078
-
-#define POWM_SEC_TABLE 4,26,216,804,1731
-
-#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 106
-#define HGCD_APPR_THRESHOLD 109
-#define HGCD_REDUCE_THRESHOLD 2205
-#define GCD_DC_THRESHOLD 492
-#define GCDEXT_DC_THRESHOLD 327
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 16
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 537
-#define SET_STR_PRECOMPUTE_THRESHOLD 1576
-
-#define FAC_DSC_THRESHOLD 426
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm b/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm
deleted file mode 100644
index 3d32b46c35..0000000000
--- a/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm
+++ /dev/null
@@ -1,589 +0,0 @@
-dnl PowerPC-64 mpn_mul_basecase.
-
-dnl Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 ?
-C POWER6 12.25
-
-C TODO
-C * Reduce register usage. At least 4 fewer registers could be used.
-C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
-C would bring us to 9 c/l.
-C * The bdz insns for b1 and b2 will never branch, so they could be removed.
-C * Align things better, perhaps by moving things like pointer updates from
-C before to after loops.
-
-C INPUT PARAMETERS
-C rp where the product limbs are stored
-C up, un first operand and its limb count
-C vp, vn second operand and its limb count
-define(`rp', `r3')
-define(`up', `r4')
-define(`un', `r5')
-define(`vp', `r6')
-define(`vn', `r7')
-
-C Registers used by the un > 2 code:
-C v0 the limb of vp being multiplied in the current pass
-C outer_rp result pointer for the start of the current pass
-C outer_up copy of the original up pointer
-define(`v0', `r25')
-define(`outer_rp', `r22')
-define(`outer_up', `r23')
-
-C mpn_mul_basecase multiplies the un limbs at up by the vn limbs at vp and
-C stores the product limbs at rp.
-C The un <= 2 code below uses only r0 and r5-r12 and saves nothing. It
-C covers 1x1, 2x1 and 2x2 only, so it relies on vn <= un.
-ASM_START()
-PROLOGUE(mpn_mul_basecase)
-
-C Special code for un <= 2, for efficiency of these important cases,
-C and since it simplifies the default code.
- cmpdi cr0, un, 2
- bgt cr0, L(un_gt2)
- cmpdi cr6, vn, 1 C record vn == 1 before r7 is overwritten
- ld r7, 0(vp) C r7 = v[0], replaces vn
- ld r5, 0(up) C r5 = u[0], replaces un
- mulld r8, r5, r7 C weight 0
- mulhdu r9, r5, r7 C weight 1
- std r8, 0(rp)
- beq cr0, L(2x) C cr0 still holds un == 2
- std r9, 8(rp) C un == 1: two result limbs
- blr
- ALIGN(16)
-L(2x): ld r0, 8(up) C r0 = u[1]
- mulld r8, r0, r7 C weight 1
- mulhdu r10, r0, r7 C weight 2
- addc r9, r9, r8
- addze r10, r10
- bne cr6, L(2x2) C vn != 1
- std r9, 8(rp) C 2x1: three result limbs
- std r10, 16(rp)
- blr
- ALIGN(16)
-L(2x2): ld r6, 8(vp) C r6 = v[1], replaces vp
- nop
- mulld r8, r5, r6 C weight 1
- mulhdu r11, r5, r6 C weight 2
- mulld r12, r0, r6 C weight 2
- mulhdu r0, r0, r6 C weight 3
- addc r9, r9, r8
- std r9, 8(rp)
- adde r11, r11, r10 C sum the three weight 2 terms...
- addze r0, r0 C ...moving each carry into weight 3
- addc r11, r11, r12
- addze r0, r0
- std r11, 16(rp)
- std r0, 24(rp)
- blr
-
-C General case, un > 2. Save r20-r31 below r1 (r1 is not moved), load the
-C first limb of vp and dispatch on un mod 4. After the srdi, the un
-C register holds the loop count (un + 4) >> 2; each outer pass reloads ctr
-C from it.
-L(un_gt2):
- std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
- std r26, -48(r1)
- std r25, -56(r1)
- std r24, -64(r1)
- std r23, -72(r1)
- std r22, -80(r1)
- std r21, -88(r1)
- std r20, -96(r1)
-
- mr outer_rp, rp C keep the pass base pointers
- mr outer_up, up
-
- ld v0, 0(vp) C new v limb
- addi vp, vp, 8
- ld r26, 0(up) C r26 = u[0], used by the b1/b2/b3 entries
-
- rldicl. r0, un, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2
- addi un, un, 4 C compute count...
- srdi un, un, 2 C ...for ctr
- mtctr un C copy inner loop count into ctr
- beq cr0, L(b0) C un & 3 == 0
- blt cr6, L(b1) C un & 3 == 1
- beq cr6, L(b2) C un & 3 == 2, otherwise fall into b3
-
-
-C un mod 4 == 3. L(b3) and L(lo_m_3) form the first pass, which stores
-C up times v0 at rp without reading rp. L(outer_lo_3) and L(lo_3) run once
-C for each further limb of vp and add up times v0 into the limbs at rp.
-C NOTE(review): the numeric columns in L(lo_3) sit under the heading
-C registers dying; some entries (7, 26, 27) do not match the registers on
-C their line and look stale -- confirm before relying on them.
- ALIGN(16)
-L(b3):
- ld r27, 8(up)
- ld r20, 16(up)
- mulld r0, r26, v0 C r26 = u[0] was loaded before the dispatch
- mulhdu r31, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- mulld r9, r20, v0
- mulhdu r10, r20, v0
- addc r24, r24, r31 C add each high half into the next low half
- adde r9, r9, r8
- addze r12, r10 C r12 = running high limb
- std r0, 0(rp)
- std r24, 8(rp)
- std r9, 16(rp)
- addi up, up, 16 C loop offsets 8..32 now address limbs 3..6
- addi rp, rp, 16
- bdz L(end_m_3) C skip the loop when the count is used up
-
- ALIGN(32)
-L(lo_m_3):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up)
- ld r21, 32(up)
- mulld r0, r26, v0
- mulhdu r31, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- mulld r9, r20, v0
- mulhdu r27, r20, v0
- mulld r11, r21, v0
- mulhdu r26, r21, v0
- adde r0, r0, r12 C previous high limb plus pending CA
- adde r24, r24, r31
- std r0, 8(rp)
- adde r9, r9, r8
- std r24, 16(rp)
- adde r11, r11, r27
- std r9, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- mr r12, r26 C high limb for the next pass; CA stays pending
- bdnz L(lo_m_3)
-
- ALIGN(16)
-L(end_m_3):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1 C one limb of vp done, set cr0
- std r12, 8(rp)
- beq L(ret) C no limbs of vp left
-
- ALIGN(16)
-L(outer_lo_3):
- mtctr un C copy inner loop count into ctr
- addi rp, outer_rp, 24 C rp = start of this row + 16
- addi up, outer_up, 16
- addi outer_rp, outer_rp, 8 C each row starts one limb higher
- ld v0, 0(vp) C new v limb
- addi vp, vp, 8
- ld r26, -16(up)
- ld r27, -8(up)
- ld r20, 0(up)
- mulld r0, r26, v0
- mulhdu r31, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- mulld r9, r20, v0
- mulhdu r10, r20, v0
- ld r28, -16(rp)
- ld r29, -8(rp)
- ld r30, 0(rp)
- addc r24, r24, r31 C first chain: join the product halves
- adde r9, r9, r8
- addze r12, r10
- addc r0, r0, r28 C second chain: add the limbs already at rp
- std r0, -16(rp)
- adde r24, r24, r29
- std r24, -8(rp)
- adde r9, r9, r30
- std r9, 0(rp)
- bdz L(end_3)
-
- ALIGN(32) C registers dying
-L(lo_3):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up) C
- ld r21, 32(up) C
- addi up, up, 32 C
- addi rp, rp, 32 C
- mulld r0, r26, v0 C
- mulhdu r10, r26, v0 C 26
- mulld r24, r27, v0 C
- mulhdu r8, r27, v0 C 27
- mulld r9, r20, v0 C
- mulhdu r27, r20, v0 C 26
- mulld r11, r21, v0 C
- mulhdu r26, r21, v0 C 27
- ld r28, -24(rp) C
- adde r0, r0, r12 C 0 12
- ld r29, -16(rp) C
- adde r24, r24, r10 C 24 10
- ld r30, -8(rp) C
- ld r31, 0(rp) C
- adde r9, r9, r8 C 8 9
- adde r11, r11, r27 C 27 11
- addze r12, r26 C 26
- addc r0, r0, r28 C 0 28
- std r0, -24(rp) C 0
- adde r24, r24, r29 C 7 29
- std r24, -16(rp) C 7
- adde r9, r9, r30 C 9 30
- std r9, -8(rp) C 9
- adde r11, r11, r31 C 11 31
- std r11, 0(rp) C 11
- bdnz L(lo_3) C
-
- ALIGN(16)
-L(end_3):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1
- std r12, 8(rp)
- bne L(outer_lo_3) C more limbs of vp to process
- b L(ret)
-
-
-C un mod 4 == 1. L(b1) and L(lo_m_1) form the first pass, which stores
-C up times v0 at rp without reading rp. L(outer_lo_1) and L(lo_1) run once
-C for each further limb of vp and add up times v0 into the limbs at rp.
-C NOTE(review): some entries in the registers dying columns of L(lo_1)
-C (7, 26, 27) do not match the registers on their line and look stale.
- ALIGN(16)
-L(b1):
- mulld r0, r26, v0 C r26 = u[0] was loaded before the dispatch
- mulhdu r12, r26, v0 C r12 = running high limb
- addic r0, r0, 0 C adding 0 leaves r0 unchanged and CA clear
- std r0, 0(rp)
- bdz L(end_m_1)
-
- ALIGN(16)
-L(lo_m_1):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up)
- ld r21, 32(up)
- mulld r0, r26, v0
- mulhdu r31, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- mulld r9, r20, v0
- mulhdu r27, r20, v0
- mulld r11, r21, v0
- mulhdu r26, r21, v0
- adde r0, r0, r12 C previous high limb plus pending CA
- adde r24, r24, r31
- std r0, 8(rp)
- adde r9, r9, r8
- std r24, 16(rp)
- adde r11, r11, r27
- std r9, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- mr r12, r26 C high limb for the next pass; CA stays pending
- bdnz L(lo_m_1)
-
- ALIGN(16)
-L(end_m_1):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1 C one limb of vp done, set cr0
- std r12, 8(rp)
- beq L(ret) C no limbs of vp left
-
- ALIGN(16)
-L(outer_lo_1):
- mtctr un C copy inner loop count into ctr
- addi rp, outer_rp, 8 C this row starts one limb higher
- mr up, outer_up
- addi outer_rp, outer_rp, 8
- ld v0, 0(vp) C new v limb
- addi vp, vp, 8
- ld r26, 0(up)
- ld r28, 0(rp)
- mulld r0, r26, v0
- mulhdu r12, r26, v0
- addc r0, r0, r28 C add the limb already at rp
- std r0, 0(rp)
- bdz L(end_1)
-
- ALIGN(32) C registers dying
-L(lo_1):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up) C
- ld r21, 32(up) C
- addi up, up, 32 C
- addi rp, rp, 32 C
- mulld r0, r26, v0 C
- mulhdu r10, r26, v0 C 26
- mulld r24, r27, v0 C
- mulhdu r8, r27, v0 C 27
- mulld r9, r20, v0 C
- mulhdu r27, r20, v0 C 26
- mulld r11, r21, v0 C
- mulhdu r26, r21, v0 C 27
- ld r28, -24(rp) C
- adde r0, r0, r12 C 0 12
- ld r29, -16(rp) C
- adde r24, r24, r10 C 24 10
- ld r30, -8(rp) C
- ld r31, 0(rp) C
- adde r9, r9, r8 C 8 9
- adde r11, r11, r27 C 27 11
- addze r12, r26 C 26
- addc r0, r0, r28 C 0 28
- std r0, -24(rp) C 0
- adde r24, r24, r29 C 7 29
- std r24, -16(rp) C 7
- adde r9, r9, r30 C 9 30
- std r9, -8(rp) C 9
- adde r11, r11, r31 C 11 31
- std r11, 0(rp) C 11
- bdnz L(lo_1) C
-
- ALIGN(16)
-L(end_1):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1
- std r12, 8(rp)
- bne L(outer_lo_1) C more limbs of vp to process
- b L(ret)
-
-
-C un mod 4 == 0. No leading limbs are handled outside the loops; both
-C pointers are biased by one limb so that the loop offsets 8..32 address
-C limbs 0..3. L(lo_m_0) is the first pass, which stores up times v0 at rp
-C without reading rp. L(outer_lo_0) and L(lo_0) run once for each further
-C limb of vp and add up times v0 into the limbs at rp.
-C NOTE(review): some entries in the registers dying columns of L(lo_0)
-C (7, 26, 27) do not match the registers on their line and look stale.
- ALIGN(16)
-L(b0):
- addi up, up, -8
- addi rp, rp, -8
- li r12, 0 C no high limb carried in yet
- addic r12, r12, 0 C adding 0 leaves r12 = 0 and CA clear
- bdz L(end_m_0)
-
- ALIGN(16)
-L(lo_m_0):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up)
- ld r21, 32(up)
- mulld r0, r26, v0
- mulhdu r31, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- mulld r9, r20, v0
- mulhdu r27, r20, v0
- mulld r11, r21, v0
- mulhdu r26, r21, v0
- adde r0, r0, r12 C previous high limb plus pending CA
- adde r24, r24, r31
- std r0, 8(rp)
- adde r9, r9, r8
- std r24, 16(rp)
- adde r11, r11, r27
- std r9, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- mr r12, r26 C high limb for the next pass; CA stays pending
- bdnz L(lo_m_0)
-
- ALIGN(16)
-L(end_m_0):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1 C one limb of vp done, set cr0
- std r12, 8(rp)
- beq L(ret) C no limbs of vp left
-
- ALIGN(16)
-L(outer_lo_0):
- mtctr un C copy inner loop count into ctr
- addi rp, outer_rp, 0 C equals the new row start biased by -8
- addi up, outer_up, -8
- addi outer_rp, outer_rp, 8
- ld v0, 0(vp) C new v limb
- addi vp, vp, 8
- li r12, 0
- addic r12, r12, 0 C r12 = 0 and CA clear
- bdz L(end_0)
-
- ALIGN(32) C registers dying
-L(lo_0):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up) C
- ld r21, 32(up) C
- addi up, up, 32 C
- addi rp, rp, 32 C
- mulld r0, r26, v0 C
- mulhdu r10, r26, v0 C 26
- mulld r24, r27, v0 C
- mulhdu r8, r27, v0 C 27
- mulld r9, r20, v0 C
- mulhdu r27, r20, v0 C 26
- mulld r11, r21, v0 C
- mulhdu r26, r21, v0 C 27
- ld r28, -24(rp) C
- adde r0, r0, r12 C 0 12
- ld r29, -16(rp) C
- adde r24, r24, r10 C 24 10
- ld r30, -8(rp) C
- ld r31, 0(rp) C
- adde r9, r9, r8 C 8 9
- adde r11, r11, r27 C 27 11
- addze r12, r26 C 26
- addc r0, r0, r28 C 0 28
- std r0, -24(rp) C 0
- adde r24, r24, r29 C 7 29
- std r24, -16(rp) C 7
- adde r9, r9, r30 C 9 30
- std r9, -8(rp) C 9
- adde r11, r11, r31 C 11 31
- std r11, 0(rp) C 11
- bdnz L(lo_0) C
-
- ALIGN(16)
-L(end_0):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1
- std r12, 8(rp)
- bne L(outer_lo_0) C more limbs of vp to process
- b L(ret)
-
-
-C un mod 4 == 2. L(b2) and L(lo_m_2) form the first pass, which stores
-C up times v0 at rp without reading rp. L(outer_lo_2) and L(lo_2) run once
-C for each further limb of vp and add up times v0 into the limbs at rp.
-C This is the last path in the file, so L(end_2) falls through into L(ret).
-C NOTE(review): some entries in the registers dying columns of L(lo_2)
-C (7, 26, 27) do not match the registers on their line and look stale.
- ALIGN(16)
-L(b2): ld r27, 8(up)
- addi up, up, 8
- mulld r0, r26, v0 C r26 = u[0] was loaded before the dispatch
- mulhdu r10, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- addc r24, r24, r10
- addze r12, r8 C r12 = running high limb
- std r0, 0(rp)
- std r24, 8(rp)
- addi rp, rp, 8 C loop offsets 8..32 now address limbs 2..5
- bdz L(end_m_2)
-
- ALIGN(16)
-L(lo_m_2):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up)
- ld r21, 32(up)
- mulld r0, r26, v0
- mulhdu r31, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- mulld r9, r20, v0
- mulhdu r27, r20, v0
- mulld r11, r21, v0
- mulhdu r26, r21, v0
- adde r0, r0, r12 C previous high limb plus pending CA
- adde r24, r24, r31
- std r0, 8(rp)
- adde r9, r9, r8
- std r24, 16(rp)
- adde r11, r11, r27
- std r9, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- mr r12, r26 C high limb for the next pass; CA stays pending
- bdnz L(lo_m_2)
-
- ALIGN(16)
-L(end_m_2):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1 C one limb of vp done, set cr0
- std r12, 8(rp)
- beq L(ret) C no limbs of vp left
-
- ALIGN(16)
-L(outer_lo_2):
- mtctr un C copy inner loop count into ctr
- addi rp, outer_rp, 16 C rp = start of this row + 8
- addi up, outer_up, 8
- addi outer_rp, outer_rp, 8 C each row starts one limb higher
- ld v0, 0(vp) C new v limb
- addi vp, vp, 8
- ld r26, -8(up)
- ld r27, 0(up)
- ld r28, -8(rp)
- ld r29, 0(rp)
- mulld r0, r26, v0
- mulhdu r10, r26, v0
- mulld r24, r27, v0
- mulhdu r8, r27, v0
- addc r24, r24, r10 C first chain: join the product halves
- addze r12, r8
- addc r0, r0, r28 C second chain: add the limbs already at rp
- std r0, -8(rp)
- adde r24, r24, r29
- std r24, 0(rp)
- bdz L(end_2)
-
- ALIGN(16) C registers dying
-L(lo_2):
- ld r26, 8(up)
- ld r27, 16(up)
- ld r20, 24(up) C
- ld r21, 32(up) C
- addi up, up, 32 C
- addi rp, rp, 32 C
- mulld r0, r26, v0 C
- mulhdu r10, r26, v0 C 26
- mulld r24, r27, v0 C
- mulhdu r8, r27, v0 C 27
- mulld r9, r20, v0 C
- mulhdu r27, r20, v0 C 26
- mulld r11, r21, v0 C
- mulhdu r26, r21, v0 C 27
- ld r28, -24(rp) C
- adde r0, r0, r12 C 0 12
- ld r29, -16(rp) C
- adde r24, r24, r10 C 24 10
- ld r30, -8(rp) C
- ld r31, 0(rp) C
- adde r9, r9, r8 C 8 9
- adde r11, r11, r27 C 27 11
- addze r12, r26 C 26
- addc r0, r0, r28 C 0 28
- std r0, -24(rp) C 0
- adde r24, r24, r29 C 7 29
- std r24, -16(rp) C 7
- adde r9, r9, r30 C 9 30
- std r9, -8(rp) C 9
- adde r11, r11, r31 C 11 31
- std r11, 0(rp) C 11
- bdnz L(lo_2) C
-
- ALIGN(16)
-L(end_2):
- addze r12, r12 C fold the pending CA into the top limb
- addic. vn, vn, -1
- std r12, 8(rp)
- bne L(outer_lo_2) C more limbs of vp to process
-C b L(ret)
-
-C Common exit for the un > 2 paths: reload r20-r31 from the slots below r1
-C that L(un_gt2) stored them in, then return.
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- ld r26, -48(r1)
- ld r25, -56(r1)
- ld r24, -64(r1)
- ld r23, -72(r1)
- ld r22, -80(r1)
- ld r21, -88(r1)
- ld r20, -96(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm b/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm
deleted file mode 100644
index 8731e01a89..0000000000
--- a/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm
+++ /dev/null
@@ -1,135 +0,0 @@
-dnl PowerPC-64 mpn_mul_2 and mpn_addmul_2.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb
-C mul_2 addmul_2
-C POWER3/PPC630 ? ?
-C POWER4/PPC970 ? ?
-C POWER5 ? ?
-C POWER6 ? ?
-C POWER7-SMT4 3 3
-C POWER7-SMT2 ? ?
-C POWER7-SMT1 ? ?
-
-C INPUT PARAMETERS
-C rp where result limbs are stored (and, for addmul_2, first loaded)
-C up source limbs
-C n limb count of up; r5 is reused for the current limb of up
-C vp the two multiplier limbs, loaded from 0(vp) and 8(vp)
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`vp', `r6')
-
-C cy0 and cy1 hold the two limbs carried from one step to the next. cy1
-C is EXTRA_REGISTER when that is defined; otherwise it is r31, which the
-C function then saves and restores.
-define(`cy0', `r10')
-ifdef(`EXTRA_REGISTER',
-` define(`cy1', EXTRA_REGISTER)',
-` define(`cy1', `r31')')
-
-C mul_2 build: AM(x) emits nothing and ADDX is addc, which starts a new
-C carry chain.
-ifdef(`OPERATION_mul_2',`
- define(`AM', `')
- define(`ADDX', `addc')
- define(`func', `mpn_mul_2')
-')
-C addmul_2 build: AM(x) emits x (the load of the rp limb and its addc), and
-C ADDX is adde so that it continues the chain started by that addc.
-ifdef(`OPERATION_addmul_2',`
- define(`AM', `$1')
- define(`ADDX', `adde')
- define(`func', `mpn_addmul_2')
-')
-
-MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)
-
-C func is mpn_mul_2 or mpn_addmul_2, depending on the OPERATION_* define.
-C The n limbs at up are multiplied by the two limbs at vp. One result limb
-C is stored per limb of up, one more (cy0) is stored after them, and the
-C most significant limb (cy1) is returned in r3. For addmul_2 the limbs
-C already at rp are added in as each low limb is formed.
-C The loop is unrolled twice: the products formed at L(lo0) and L(lo1) are
-C summed in the half that follows. n odd enters at L(lo1), n even at L(lo0).
-C NOTE(review): ctr is loaded with (n + 1) >> 1, so n = 0 would load 0 --
-C presumably callers always pass n >= 1; confirm.
-ASM_START()
-PROLOGUE(func)
-
-ifdef(`EXTRA_REGISTER',,`
- std r31, -8(r1)
-')
- andi. r12, n, 1 C cr0 records the parity of n; r12 is reloaded below
- addi r0, n, 1
- srdi r0, r0, 1
- mtctr r0 C ctr = (n + 1) >> 1
- ld r11, 0(vp) C v0
- li cy0, 0
- ld r12, 8(vp) C v1
- li cy1, 0
- ld r5, 0(up) C first limb of up, replaces n
- beq L(lo0) C n even
- addi up, up, -8 C n odd: bias the pointers and enter at lo1
- addi rp, rp, -8
- b L(lo1)
-
- ALIGN(32)
-L(top):
-AM(` ld r0, -8(rp)')
- ld r5, 0(up) C limb of up for the lo0 products
-AM(` addc r6, r6, r0')
- ADDX r7, r7, r8 C sum the two weight 1 products
- addze r9, r9
- addc r6, r6, cy0 C add the limbs carried from the previous step
- adde cy0, r7, cy1
- std r6, -8(rp)
- addze cy1, r9
-L(lo0): mulld r6, r11, r5 C v0 * u[i] weight 0
- mulhdu r7, r11, r5 C v0 * u[i] weight 1
- mulld r8, r12, r5 C v1 * u[i] weight 1
- mulhdu r9, r12, r5 C v1 * u[i] weight 2
-AM(` ld r0, 0(rp)')
- ld r5, 8(up) C limb of up for the lo1 products
-AM(` addc r6, r6, r0')
- ADDX r7, r7, r8
- addze r9, r9
- addc r6, r6, cy0
- adde cy0, r7, cy1
- std r6, 0(rp)
- addze cy1, r9
-L(lo1): mulld r6, r11, r5 C v0 * u[i] weight 0
- mulhdu r7, r11, r5 C v0 * u[i] weight 1
- addi up, up, 16
- addi rp, rp, 16
- mulld r8, r12, r5 C v1 * u[i] weight 1
- mulhdu r9, r12, r5 C v1 * u[i] weight 2
- bdnz L(top)
-
-C Sum the last set of products, which no loop pass has consumed yet.
-L(end):
-AM(` ld r0, -8(rp)')
-AM(` addc r6, r6, r0')
- ADDX r7, r7, r8
- addze r9, r9
- addc r6, r6, cy0
- std r6, -8(rp)
- adde cy0, r7, cy1
- addze cy1, r9
- std cy0, 0(rp) C limb above the last low limb
- mr r3, cy1 C return the most significant limb
-
-ifdef(`EXTRA_REGISTER',,`
- ld r31, -8(r1)
-')
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/aors_n.asm b/gmp/mpn/powerpc64/mode64/p7/aors_n.asm
deleted file mode 100644
index 857c701dec..0000000000
--- a/gmp/mpn/powerpc64/mode64/p7/aors_n.asm
+++ /dev/null
@@ -1,128 +0,0 @@
-dnl PowerPC-64 mpn_add_n, mpn_sub_n optimised for POWER7.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 ?
-C POWER6 ?
-C POWER7 2.18
-
-C This is a tad bit slower than the cnd_aors_n.asm code, which is of course an
-C anomaly.
-
-C add_n build.
-C ADDSUBC adde, the per-limb add with carry
-C GENRVAL turns r3 = CA - 1 into the carry out, CA
-C SETCBR(reg) addic r0, reg, -1: CA is set exactly when reg is nonzero
-C CLRCB addic r0, r0, 0: CA is cleared
-ifdef(`OPERATION_add_n',`
- define(ADDSUBC, adde)
- define(ADDSUB, addc)
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)
- define(GENRVAL, `addi r3, r3, 1')
- define(SETCBR, `addic r0, $1, -1')
- define(CLRCB, `addic r0, r0, 0')
-')
-C sub_n build. CA set means no borrow.
-C ADDSUBC subfe, the per-limb subtract with borrow
-C GENRVAL turns r3 = CA - 1 into the borrow out, 1 - CA
-C SETCBR(reg) subfic r0, reg, 0: CA is set exactly when reg is zero
-C CLRCB addic r0, r1, -1: sets CA as long as r1 is nonzero
-ifdef(`OPERATION_sub_n',`
- define(ADDSUBC, subfe)
- define(ADDSUB, subfc)
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)
- define(GENRVAL, `neg r3, r3')
- define(SETCBR, `subfic r0, $1, 0')
- define(CLRCB, `addic r0, r1, -1')
-')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-C INPUT PARAMETERS
-C rp where the result limbs are stored
-C up, vp the two source operands
-C n limb count; r6 is reused as a data register inside the loop
-define(`rp', `r3')
-define(`up', `r4')
-define(`vp', `r5')
-define(`n', `r6')
-
-C func_nc is mpn_add_nc or mpn_sub_nc: the same operation as func, but CA is
-C first set up from the value in r7 by SETCBR. It then branches to L(ent),
-C which is defined inside func.
-ASM_START()
-PROLOGUE(func_nc)
- SETCBR(r7)
- b L(ent)
-EPILOGUE()
-
-C func is mpn_add_n or mpn_sub_n: rp limb = up limb plus or minus vp limb
-C for n limbs, with the carry or borrow out returned in r3.
-C An odd n is handled by one leading limb at L(bx1). The loop does 4 limbs
-C per pass and is entered at L(mid) when bit 1 of n is set, so that the
-C first pass only does 2 limbs.
-C NOTE(review): ctr is loaded with (n + 2) >> 2, so n = 0 would load 0 --
-C presumably callers always pass n >= 1; confirm.
-PROLOGUE(func)
- CLRCB
-L(ent):
- andi. r7, n, 1
- beq L(bx0) C n even
-
-L(bx1): ld r7, 0(up)
- ld r9, 0(vp)
- ADDSUBC r11, r9, r7 C sub_n: subfe gives r7 - r9, i.e. up minus vp
- std r11, 0(rp)
- cmpldi cr6, n, 1
- beq cr6, L(end) C n == 1: done
- addi up, up, 8
- addi vp, vp, 8
- addi rp, rp, 8
-
-L(bx0): addi r0, n, 2 C compute branch...
- srdi r0, r0, 2 C ...count
- mtctr r0
-
- andi. r7, n, 2
- bne L(mid) C bit 1 of n set: start with the 2-limb half
-
- addi up, up, 16 C bias so the offsets at L(top) start at limb 0
- addi vp, vp, 16
- addi rp, rp, 16
-
- ALIGN(32)
-L(top): ld r6, -16(up) C r6 held n, which is no longer needed
- ld r7, -8(up)
- ld r8, -16(vp)
- ld r9, -8(vp)
- ADDSUBC r10, r8, r6
- ADDSUBC r11, r9, r7
- std r10, -16(rp)
- std r11, -8(rp)
-L(mid): ld r6, 0(up)
- ld r7, 8(up)
- ld r8, 0(vp)
- ld r9, 8(vp)
- ADDSUBC r10, r8, r6
- ADDSUBC r11, r9, r7
- std r10, 0(rp)
- std r11, 8(rp)
- addi up, up, 32
- addi vp, vp, 32
- addi rp, rp, 32
- bdnz L(top)
-
-L(end): subfe r3, r0, r0 C -cy
- GENRVAL
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm b/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
deleted file mode 100644
index ddf5fd84b1..0000000000
--- a/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
-
-dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C Parameters for the shared source included on the last line. LSH is
-C pasted into the function names there; RSH equals 64 - LSH.
-define(LSH, 1)
-define(RSH, 63)
-
-C Map the OPERATION_* symbol onto the DO_* symbol tested by the shared source.
-ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
-
-C The code itself comes from the shared source.
-include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm b/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
deleted file mode 100644
index 3f9d88d6ca..0000000000
--- a/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
-
-dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C Parameters for the shared source included on the last line. LSH is
-C pasted into the function names there; RSH equals 64 - LSH.
-define(LSH, 2)
-define(RSH, 62)
-
-C Map the OPERATION_* symbol onto the DO_* symbol tested by the shared source.
-ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
-ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
-ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
-
-C The code itself comes from the shared source.
-include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm b/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
deleted file mode 100644
index 525120262f..0000000000
--- a/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
+++ /dev/null
@@ -1,129 +0,0 @@
-dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
-
-dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 ?
-C POWER6 ?
-C POWER7 2.5
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`vp', `r5')
-define(`n', `r6')
-
-ifdef(`DO_add', `
- define(`ADDSUBC', `addc $1, $2, $3')
- define(`ADDSUBE', `adde $1, $2, $3')
- define(INITCY, `addic $1, r1, 0')
- define(RETVAL, `addze r3, $1')
- define(`func', mpn_addlsh`'LSH`'_n)')
-ifdef(`DO_sub', `
- define(`ADDSUBC', `subfc $1, $2, $3')
- define(`ADDSUBE', `subfe $1, $2, $3')
- define(INITCY, `addic $1, r1, -1')
- define(RETVAL, `subfze r3, $1
- neg r3, r3')
- define(`func', mpn_sublsh`'LSH`'_n)')
-ifdef(`DO_rsb', `
- define(`ADDSUBC', `subfc $1, $3, $2')
- define(`ADDSUBE', `subfe $1, $3, $2')
- define(INITCY, `addic $1, r1, -1')
- define(RETVAL, `addme r3, $1')
- define(`func', mpn_rsblsh`'LSH`'_n)')
-
-define(`s0', `r0') define(`s1', `r9')
-define(`u0', `r6') define(`u1', `r7')
-define(`v0', `r10') define(`v1', `r11')
-
-
-ASM_START()
-PROLOGUE(func)
- rldic r7, n, 3, 59
- add up, up, r7
- add vp, vp, r7
- add rp, rp, r7
-
-ifdef(`DO_add', `
- addic r0, n, 3 C set cy flag as side effect
-',`
- subfc r0, r0, r0 C set cy flag
- addi r0, n, 3
-')
- srdi r0, r0, 2
- mtctr r0
-
- andi. r0, n, 1
- beq L(bx0)
-
-L(bx1): andi. r0, n, 2
- li s0, 0
- bne L(lo3)
- b L(lo1)
-
-L(bx0): andi. r0, n, 2
- li s1, 0
- bne L(lo2)
-
- ALIGN(32)
-L(top): addi rp, rp, 32
- ld v0, 0(vp)
- addi vp, vp, 32
- rldimi s1, v0, LSH, 0
- ld u0, 0(up)
- addi up, up, 32
- srdi s0, v0, RSH
- ADDSUBE(s1, s1, u0)
- std s1, -32(rp)
-L(lo3): ld v1, -24(vp)
- rldimi s0, v1, LSH, 0
- ld u1, -24(up)
- srdi s1, v1, RSH
- ADDSUBE(s0, s0, u1)
- std s0, -24(rp)
-L(lo2): ld v0, -16(vp)
- rldimi s1, v0, LSH, 0
- ld u0, -16(up)
- srdi s0, v0, RSH
- ADDSUBE(s1, s1, u0)
- std s1, -16(rp)
-L(lo1): ld v1, -8(vp)
- rldimi s0, v1, LSH, 0
- ld u1, -8(up)
- srdi s1, v1, RSH
- ADDSUBE(s0, s0, u1)
- std s0, -8(rp)
- bdnz L(top) C decrement CTR and loop back
-
- RETVAL( s1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/gcd_1.asm b/gmp/mpn/powerpc64/mode64/p7/gcd_1.asm
deleted file mode 100644
index 47cb40bdc5..0000000000
--- a/gmp/mpn/powerpc64/mode64/p7/gcd_1.asm
+++ /dev/null
@@ -1,110 +0,0 @@
-dnl PowerPC-64 mpn_gcd_1.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/bit (approx)
-C POWER3/PPC630 -
-C POWER4/PPC970 -
-C POWER5 -
-C POWER6 -
-C POWER7 7.6
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up', `r3')
-define(`n', `r4')
-define(`v0', `r5')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
- mflr r0
- std r30, -16(r1)
- std r31, -8(r1)
- std r0, 16(r1)
- stdu r1, -128(r1)
-
- ld r7, 0(up) C U low limb
- or r0, r5, r7 C x | y
-
- neg r6, r0
- and r6, r6, r0
- cntlzd r31, r6 C common twos
- subfic r31, r31, 63
-
- neg r6, r5
- and r6, r6, r5
- cntlzd r8, r6
- subfic r8, r8, 63
- srd r5, r5, r8
- mr r30, r5 C v0 saved
-
- cmpdi r4, BMOD_1_TO_MOD_1_THRESHOLD
- blt L(bmod)
- CALL( mpn_mod_1)
- b L(reduced)
-L(bmod):
- li r6, 0
- CALL( mpn_modexact_1c_odd)
-L(reduced):
-
-define(`cnt', `r9')dnl
-
- neg. r6, r3
- and r6, r6, r3
- cntlzd cnt, r6
- li r12, 63
- bne L(mid)
- b L(end)
-
- ALIGN(16)
-L(top): isel r30, r3, r30, 29 C y = min(x,y)
- isel r3, r10, r11, 29 C x = |y - x|
-L(mid): subf cnt, cnt, r12 C cnt = 63-cnt
- srd r3, r3, cnt
- subf r10, r3, r30 C r10 = y - x
- subf r11, r30, r3 C r11 = x - y
- cmpld cr7, r30, r3
- and r8, r11, r10 C isolate lsb
- cntlzd cnt, r8
- bne cr7, L(top)
-
-L(end): sld r3, r30, r31
-
- addi r1, r1, 128
- ld r0, 16(r1)
- ld r30, -16(r1)
- ld r31, -8(r1)
- mtlr r0
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h b/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h
deleted file mode 100644
index 7e719e8aac..0000000000
--- a/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h
+++ /dev/null
@@ -1,243 +0,0 @@
-/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2003, 2009-2011, 2013, 2014 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 3700 MHz POWER7/SMT4 (gcc111.fsffrance.org) */
-/* FFT tuning limit = 40000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.8 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
-#define USE_PREINV_DIVREM_1 0
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 28
-
-#define MUL_TOOM22_THRESHOLD 22
-#define MUL_TOOM33_THRESHOLD 72
-#define MUL_TOOM44_THRESHOLD 200
-#define MUL_TOOM6H_THRESHOLD 298
-#define MUL_TOOM8H_THRESHOLD 406
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 138
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 124
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 36
-#define SQR_TOOM3_THRESHOLD 109
-#define SQR_TOOM4_THRESHOLD 196
-#define SQR_TOOM6_THRESHOLD 414
-#define SQR_TOOM8_THRESHOLD 547
-
-#define MULMID_TOOM42_THRESHOLD 58
-
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 20
-
-#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
- { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
- { 13, 7}, { 28, 8}, { 15, 7}, { 33, 8}, \
- { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
- { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
- { 63, 9}, { 43,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
- { 159,11}, { 95,10}, { 191, 9}, { 383,11}, \
- { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
- { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
- { 383, 9}, { 767,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 271,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 575,11}, { 303,12}, \
- { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
- { 671,11}, { 351,10}, { 703, 9}, { 1407,11}, \
- { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
- { 223,11}, { 447,10}, { 895,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
- { 351,11}, { 703,12}, { 383,11}, { 767,12}, \
- { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \
- { 895,12}, { 479,14}, { 127,13}, { 255,12}, \
- { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
- { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \
- { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \
- { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \
- { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \
- { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
- { 447,12}, { 895,11}, { 1791,12}, { 959,11}, \
- { 1919,14}, { 255,13}, { 511,12}, { 1087,11}, \
- { 2175,13}, { 575,12}, { 1215,11}, { 2431,13}, \
- { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \
- { 1407,11}, { 2815,14}, { 383,13}, { 767,12}, \
- { 1599,13}, { 831,12}, { 1663,13}, { 895,12}, \
- { 1791,13}, { 959,12}, { 1919,11}, { 3839,14}, \
- { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
- { 2175,13}, { 1215,12}, { 2431,11}, { 4863,14}, \
- { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
- { 2687,13}, { 1407,12}, { 2815,13}, { 1471,12}, \
- { 2943,14}, { 767,13}, { 1599,12}, { 3199,13}, \
- { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \
- { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
- { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
- { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
- { 3455,12}, { 6911,14}, { 1919,13}, { 3839,16}, \
- { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
- { 2431,13}, { 4863,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 231
-#define MUL_FFT_THRESHOLD 4288
-
-#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 368, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
- { 25, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
- { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
- { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
- { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
- { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \
- { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \
- { 639,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,13}, { 63,12}, { 127,11}, { 255,10}, \
- { 511,11}, { 271,10}, { 543, 9}, { 1087,11}, \
- { 287,10}, { 575, 9}, { 1151,11}, { 303,10}, \
- { 607,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
- { 479,13}, { 127,12}, { 255,11}, { 543,10}, \
- { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
- { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
- { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
- { 383,11}, { 767,12}, { 415,11}, { 831,10}, \
- { 1663,12}, { 447,11}, { 895,12}, { 479,14}, \
- { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
- { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \
- { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \
- { 703,11}, { 1407,10}, { 2815,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
- { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \
- { 1791,12}, { 959,11}, { 1919,10}, { 3839,14}, \
- { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \
- { 2687,13}, { 703,12}, { 1407,14}, { 383,13}, \
- { 767,12}, { 1599,13}, { 831,12}, { 1663,13}, \
- { 895,12}, { 1791,13}, { 959,12}, { 1919,14}, \
- { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
- { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \
- { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \
- { 1407,12}, { 2815,13}, { 1471,14}, { 767,13}, \
- { 1663,12}, { 3327,13}, { 1727,14}, { 895,13}, \
- { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \
- { 1663,13}, { 3455,14}, { 1791,13}, { 3583,14}, \
- { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \
- { 2175,13}, { 4479,14}, { 2303,13}, { 4607,14}, \
- { 2431,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 230
-#define SQR_FFT_THRESHOLD 3264
-
-#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 34
-#define MULLO_MUL_N_THRESHOLD 9174
-
-#define DC_DIV_QR_THRESHOLD 33
-#define DC_DIVAPPR_Q_THRESHOLD 126
-#define DC_BDIV_QR_THRESHOLD 63
-#define DC_BDIV_Q_THRESHOLD 152
-
-#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 155
-#define INV_APPR_THRESHOLD 125
-
-#define BINV_NEWTON_THRESHOLD 294
-#define REDC_1_TO_REDC_2_THRESHOLD 17
-#define REDC_2_TO_REDC_N_THRESHOLD 115
-
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1334
-#define MUPI_DIV_QR_THRESHOLD 54
-#define MU_BDIV_QR_THRESHOLD 1142
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 1,14,62,642,960
-
-#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 126
-#define HGCD_APPR_THRESHOLD 184
-#define HGCD_REDUCE_THRESHOLD 3014
-#define GCD_DC_THRESHOLD 440
-#define GCDEXT_DC_THRESHOLD 386
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 11
-#define GET_STR_PRECOMPUTE_THRESHOLD 17
-#define SET_STR_DC_THRESHOLD 1655
-#define SET_STR_PRECOMPUTE_THRESHOLD 3417
-
-#define FAC_DSC_THRESHOLD 1138
-#define FAC_ODD_THRESHOLD 27
diff --git a/gmp/mpn/powerpc64/mode64/rsh1add_n.asm b/gmp/mpn/powerpc64/mode64/rsh1add_n.asm
new file mode 100644
index 0000000000..0cd6cf4e8c
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/rsh1add_n.asm
@@ -0,0 +1,104 @@
+dnl PowerPC-64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 2 (1.5 c/l should be possible)
+C POWER4/PPC970: 4 (2.0 c/l should be possible)
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+define(`rp',`r3') C destination limb pointer (first argument)
+define(`up',`r4') C first source limb pointer
+define(`vp',`r5') C second source limb pointer
+
+define(`s0',`r6') C shifted result limb; shares r6 with n, which is moved to CTR first
+define(`s1',`r7') C shifted result limb, used alternately with s0
+define(`x',`r0') C scratch: current limb sum, later the final carry
+define(`u0',`r8') C limb loaded from up, used alternately with u1
+define(`u1',`r9') C limb loaded from up
+define(`v0',`r10') C limb loaded from vp, used alternately with v1
+define(`v1',`r11') C limb loaded from vp
+
+
+ASM_START()
+PROLOGUE(mpn_rsh1add_n) C rp[] = (up[] + vp[]) >> 1 over n limbs; r3 returns the bit shifted out
+ mtctr r6 C copy size to count register (r6 is reused as s0 afterwards)
+ addi rp, rp, -8 C bias rp so the stores below use offsets 8 and 16
+
+ ld u1, 0(up) C limb 0 is read before any size test, so n >= 1 is assumed
+ ld v1, 0(vp)
+ addc x, v1, u1 C x = u[0] + v[0]; starts the carry chain in CA
+ rldicl r12, x, 0, 63 C r12 = low bit of the limb 0 sum = return value
+ srdi s1, x, 1 C s1 = x >> 1; its top bit comes from the next sum
+
+ bdz L(1) C n was 1: only the carry is left to merge
+
+ ld u0, 8(up) C fetch limb 1
+ ld v0, 8(vp)
+
+ bdz L(end) C n was 2: skip the loop
+
+L(oop): ldu u1, 16(up) C load the limb 16 bytes on and advance up by 16
+ ldu v1, 16(vp) C same for vp
+ adde x, v0, u0 C x = u0 + v0 + CA
+ srdi s0, x, 1 C s0 = x >> 1, completed on the next step
+ rldimi s1, x, 63, 0 C top bit of s1 = low bit of x
+ std s1, 8(rp) C store the completed limb
+
+ bdz L(exit) C count exhausted: last pair is in u1/v1
+
+ ld u0, 8(up) C next limb pair, relative to the advanced up/vp
+ ld v0, 8(vp)
+ adde x, v1, u1 C x = u1 + v1 + CA
+ srdi s1, x, 1 C s1 = x >> 1, completed on the next step
+ rldimi s0, x, 63, 0 C top bit of s0 = low bit of x
+ stdu s0, 16(rp) C store the completed limb and advance rp by 16
+
+ bdnz L(oop) C loop while limbs remain; fall through with last pair in u0/v0
+
+L(end): adde x, v0, u0 C final sum when the last pair is in u0/v0
+ srdi s0, x, 1
+ rldimi s1, x, 63, 0
+ std s1, 8(rp) C store the second-to-last result limb
+
+ li x, 0
+ addze x, x C x = carry out of the whole addition
+ rldimi s0, x, 63, 0 C carry becomes the top bit of the last limb
+ std s0, 16(rp) C store the last result limb
+ mr r3, r12 C return the shifted-out bit
+ blr
+
+L(exit): adde x, v1, u1 C final sum when the last pair is in u1/v1
+ srdi s1, x, 1
+ rldimi s0, x, 63, 0
+ stdu s0, 16(rp) C store and advance rp so the tail below stores at 8(rp)
+
+L(1): li x, 0
+ addze x, x C x = carry out of the whole addition
+ rldimi s1, x, 63, 0 C carry becomes the top bit of the last limb
+ std s1, 8(rp) C store the last result limb
+ mr r3, r12 C return the shifted-out bit
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm b/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm
deleted file mode 100644
index 7f7734bcef..0000000000
--- a/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm
+++ /dev/null
@@ -1,172 +0,0 @@
-dnl PowerPC-64 mpn_rsh1add_n, mpn_rsh1sub_n
-
-dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 2.9
-C POWER5 ?
-C POWER6 3.5
-C POWER7 2.25
-
-define(`rp', `r3')
-define(`up', `r4')
-define(`vp', `r5')
-define(`n', `r6')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(`ADDSUBC', `addc')
- define(`ADDSUBE', `adde')
- define(INITCY, `addic $1, r1, 0')
- define(`func', mpn_rsh1add_n)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(`ADDSUBC', `subfc')
- define(`ADDSUBE', `subfe')
- define(INITCY, `addic $1, r1, -1')
- define(`func', mpn_rsh1sub_n)')
-
-define(`s0', `r9')
-define(`s1', `r7')
-define(`x0', `r0')
-define(`x1', `r12')
-define(`u0', `r8')
-define(`v0', `r10')
-
-
-ASM_START()
-PROLOGUE(func)
- ld u0, 0(up)
- ld v0, 0(vp)
-
- cmpdi cr6, n, 2
-
- addi r0, n, 1
- srdi r0, r0, 2
- mtctr r0 C copy size to count register
-
- andi. r0, n, 1
- bne cr0, L(bx1)
-
-L(bx0): ADDSUBC x1, v0, u0
- ld u0, 8(up)
- ld v0, 8(vp)
- ADDSUBE x0, v0, u0
- ble cr6, L(n2)
- ld u0, 16(up)
- ld v0, 16(vp)
- srdi s0, x1, 1
- rldicl r11, x1, 0, 63 C return value
- ADDSUBE x1, v0, u0
- andi. n, n, 2
- bne cr0, L(b10)
-L(b00): addi rp, rp, -24
- b L(lo0)
-L(b10): addi up, up, 16
- addi vp, vp, 16
- addi rp, rp, -8
- b L(lo2)
-
- ALIGN(16)
-L(bx1): ADDSUBC x0, v0, u0
- ble cr6, L(n1)
- ld u0, 8(up)
- ld v0, 8(vp)
- ADDSUBE x1, v0, u0
- ld u0, 16(up)
- ld v0, 16(vp)
- srdi s1, x0, 1
- rldicl r11, x0, 0, 63 C return value
- ADDSUBE x0, v0, u0
- andi. n, n, 2
- bne cr0, L(b11)
-L(b01): addi up, up, 8
- addi vp, vp, 8
- addi rp, rp, -16
- b L(lo1)
-L(b11): addi up, up, 24
- addi vp, vp, 24
- bdz L(end)
-
- ALIGN(32)
-L(top): ld u0, 0(up)
- ld v0, 0(vp)
- srdi s0, x1, 1
- rldimi s1, x1, 63, 0
- std s1, 0(rp)
- ADDSUBE x1, v0, u0
-L(lo2): ld u0, 8(up)
- ld v0, 8(vp)
- srdi s1, x0, 1
- rldimi s0, x0, 63, 0
- std s0, 8(rp)
- ADDSUBE x0, v0, u0
-L(lo1): ld u0, 16(up)
- ld v0, 16(vp)
- srdi s0, x1, 1
- rldimi s1, x1, 63, 0
- std s1, 16(rp)
- ADDSUBE x1, v0, u0
-L(lo0): ld u0, 24(up)
- ld v0, 24(vp)
- srdi s1, x0, 1
- rldimi s0, x0, 63, 0
- std s0, 24(rp)
- ADDSUBE x0, v0, u0
- addi up, up, 32
- addi vp, vp, 32
- addi rp, rp, 32
- bdnz L(top)
-
-L(end): srdi s0, x1, 1
- rldimi s1, x1, 63, 0
- std s1, 0(rp)
-L(cj2): srdi s1, x0, 1
- rldimi s0, x0, 63, 0
- std s0, 8(rp)
-L(cj1): ADDSUBE x1, x1, x1 C pseudo-depends on x1
- rldimi s1, x1, 63, 0
- std s1, 16(rp)
- mr r3, r11
- blr
-
-L(n1): srdi s1, x0, 1
- rldicl r11, x0, 0, 63 C return value
- ADDSUBE x1, x1, x1 C pseudo-depends on x1
- rldimi s1, x1, 63, 0
- std s1, 0(rp)
- mr r3, r11
- blr
-
-L(n2): addi rp, rp, -8
- srdi s0, x1, 1
- rldicl r11, x1, 0, 63 C return value
- b L(cj2)
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/rsh1sub_n.asm b/gmp/mpn/powerpc64/mode64/rsh1sub_n.asm
new file mode 100644
index 0000000000..e4c78ff2b5
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/rsh1sub_n.asm
@@ -0,0 +1,102 @@
+dnl PowerPC-64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 2 (1.5 c/l should be possible)
+C POWER4/PPC970: 4 (2.0 c/l should be possible)
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+define(`rp',`r3') C destination limb pointer (first argument)
+define(`up',`r4') C minuend limb pointer
+define(`vp',`r5') C subtrahend limb pointer
+
+define(`s0',`r6') C shifted result limb; shares r6 with n, which is moved to CTR first
+define(`s1',`r7') C shifted result limb, used alternately with s0
+define(`x',`r0') C scratch: current limb difference, later the borrow mask
+define(`u0',`r8') C limb loaded from up, used alternately with u1
+define(`u1',`r9') C limb loaded from up
+define(`v0',`r10') C limb loaded from vp, used alternately with v1
+define(`v1',`r11') C limb loaded from vp
+
+
+ASM_START()
+PROLOGUE(mpn_rsh1sub_n) C rp[] = (up[] - vp[]) >> 1 over n limbs; r3 returns the bit shifted out
+ mtctr r6 C copy size to count register (r6 is reused as s0 afterwards)
+ addi rp, rp, -8 C bias rp so the stores below use offsets 8 and 16
+
+ ld u1, 0(up) C limb 0 is read before any size test, so n >= 1 is assumed
+ ld v1, 0(vp)
+ subfc x, v1, u1 C x = u[0] - v[0]; starts the borrow chain in CA
+ rldicl r12, x, 0, 63 C r12 = low bit of the limb 0 difference = return value
+ srdi s1, x, 1 C s1 = x >> 1; its top bit comes from the next difference
+
+ bdz L(1) C n was 1: only the borrow is left to merge
+
+ ld u0, 8(up) C fetch limb 1
+ ld v0, 8(vp)
+
+ bdz L(end) C n was 2: skip the loop
+
+L(oop): ldu u1, 16(up) C load the limb 16 bytes on and advance up by 16
+ ldu v1, 16(vp) C same for vp
+ subfe x, v0, u0 C x = u0 - v0 with borrow taken from CA
+ srdi s0, x, 1 C s0 = x >> 1, completed on the next step
+ rldimi s1, x, 63, 0 C top bit of s1 = low bit of x
+ std s1, 8(rp) C store the completed limb
+
+ bdz L(exit) C count exhausted: last pair is in u1/v1
+
+ ld u0, 8(up) C next limb pair, relative to the advanced up/vp
+ ld v0, 8(vp)
+ subfe x, v1, u1 C x = u1 - v1 with borrow taken from CA
+ srdi s1, x, 1 C s1 = x >> 1, completed on the next step
+ rldimi s0, x, 63, 0 C top bit of s0 = low bit of x
+ stdu s0, 16(rp) C store the completed limb and advance rp by 16
+
+ bdnz L(oop) C loop while limbs remain; fall through with last pair in u0/v0
+
+L(end): subfe x, v0, u0 C final difference when the last pair is in u0/v0
+ srdi s0, x, 1
+ rldimi s1, x, 63, 0
+ std s1, 8(rp) C store the second-to-last result limb
+
+ subfe x, x, x C x = CA - 1: all ones if the subtraction borrowed, else 0
+ rldimi s0, x, 63, 0 C borrow becomes the top bit of the last limb
+ std s0, 16(rp) C store the last result limb
+ mr r3, r12 C return the shifted-out bit
+ blr
+
+L(exit): subfe x, v1, u1 C final difference when the last pair is in u1/v1
+ srdi s1, x, 1
+ rldimi s0, x, 63, 0
+ stdu s0, 16(rp) C store and advance rp so the tail below stores at 8(rp)
+
+L(1): subfe x, x, x C x = CA - 1: all ones if the subtraction borrowed, else 0
+ rldimi s1, x, 63, 0 C borrow becomes the top bit of the last limb
+ std s1, 8(rp) C store the last result limb
+ mr r3, r12 C return the shifted-out bit
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/sqr_basecase.asm b/gmp/mpn/powerpc64/mode64/sqr_basecase.asm
deleted file mode 100644
index e76bb8878d..0000000000
--- a/gmp/mpn/powerpc64/mode64/sqr_basecase.asm
+++ /dev/null
@@ -1,863 +0,0 @@
-dnl PowerPC-64 mpn_sqr_basecase.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 8
-C POWER5 8
-C POWER6 16.25
-C POWER7 3.77
-
-C NOTES
-C * This is very crude, cleanup!
-C * Try to reduce the number of needed live registers.
-C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The
-C cost will be more live registers.
-C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
-C size a lot and speed things up perhaps 25%.
-C * Use computed goto in order to compress the code.
-C * Implement a larger final corner.
-C * Schedule callee-saves register saves into other insns. This could save
-C about 5 cycles/call. (We cannot analogously optimise the restores, since
-C the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
-C * Should the alternating std/adde sequences be split? Some pipelines handle
-C adde poorly, and might sequentialise all these instructions.
-C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
-C adjacent integer multiply insns. Except for the multiply insns, the code
-C was not carefully optimised for POWER6 or any other CPU.
-C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-
-define(`rp_outer', `r25')
-define(`up_outer', `r21')
-define(`rp_saved', `r22')
-define(`up_saved', `r23')
-define(`n_saved', `r24')
-
-ASM_START()
-PROLOGUE(mpn_sqr_basecase)
- cmpdi cr0, n, 2
- bge cr0, L(ge2)
- ld r5, 0(up) C n = 1
- nop
- mulld r8, r5, r5 C weight 0
- mulhdu r9, r5, r5 C weight 1
- std r8, 0(rp)
- std r9, 8(rp)
- blr
- ALIGN(16)
-L(ge2): bgt cr0, L(gt2)
- ld r0, 0(up) C n = 2
- nop
- mulld r8, r0, r0 C u0 * u0
- mulhdu r9, r0, r0 C u0 * u0
- ld r6, 8(up)
- mulld r10, r6, r6 C u1 * u1
- mulhdu r11, r6, r6 C u1 * u1
- mulld r4, r6, r0 C u1 * u0
- mulhdu r5, r6, r0 C u1 * u0
- addc r4, r4, r4
- adde r5, r5, r5
- addze r11, r11
- addc r9, r9, r4
- adde r10, r10, r5
- addze r11, r11
- std r8, 0(rp)
- std r9, 8(rp)
- std r10, 16(rp)
- std r11, 24(rp)
- blr
-
- ALIGN(16)
-L(gt2): std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
- std r26, -48(r1)
- std r25, -56(r1)
- std r24, -64(r1)
- std r23, -72(r1)
- std r22, -80(r1)
- std r21, -88(r1)
-
- mr rp_saved, rp
- mr up_saved, up
- mr n_saved, n
- mr rp_outer, rp
- mr up_outer, up
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2
- addic r7, n, 2 C compute count...
- srdi r7, r7, 2 C ...for ctr
- mtctr r7 C copy count into ctr
- beq- cr0, L(b0)
- blt- cr6, L(b1)
- beq- cr6, L(b2)
-
-L(b3): ld r6, 0(up)
- ld r9, 8(up)
- ld r27, 16(up)
- addi up, up, 24
- li r12, 0 C carry limb
- bdz L(em3)
-
- ALIGN(16)
-L(tm3): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r9, 0(up)
- ld r27, 8(up)
- adde r0, r0, r12
- adde r7, r7, r26
- mulld r26, r9, r6
- mulhdu r10, r9, r6
- mulld r11, r27, r6
- mulhdu r12, r27, r6
- ld r9, 16(up)
- ld r27, 24(up)
- std r0, 8(rp)
- adde r26, r26, r8
- std r7, 16(rp)
- adde r11, r11, r10
- std r26, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- bdnz L(tm3)
-
-L(em3): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- adde r0, r0, r12
- adde r7, r7, r26
- std r0, 8(rp)
- std r7, 16(rp)
- addze r8, r8
- std r8, 24(rp)
- addi n, n, 2
- b L(outer_loop)
-
-L(b0): ld r6, 0(up)
- ld r27, 8(up)
- mulld r7, r27, r6
- mulhdu r12, r27, r6
- std r7, 8(rp)
- addi rp, rp, 8
- ld r9, 16(up)
- ld r27, 24(up)
- addi up, up, 32
- bdz L(em0)
-
- ALIGN(16)
-L(tm0): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r9, 0(up)
- ld r27, 8(up)
- adde r0, r0, r12
- adde r7, r7, r26
- mulld r26, r9, r6
- mulhdu r10, r9, r6
- mulld r11, r27, r6
- mulhdu r12, r27, r6
- ld r9, 16(up)
- ld r27, 24(up)
- std r0, 8(rp)
- adde r26, r26, r8
- std r7, 16(rp)
- adde r11, r11, r10
- std r26, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- bdnz L(tm0)
-
-L(em0): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- adde r0, r0, r12
- adde r7, r7, r26
- std r0, 8(rp)
- std r7, 16(rp)
- addze r8, r8
- std r8, 24(rp)
- addi n, n, 2
- b L(outer_loop_ent_2)
-
-L(b1): ld r6, 0(up)
- ld r9, 8(up)
- ld r27, 16(up)
- mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r12, r27, r6
- addc r7, r7, r26
- std r0, 8(rp)
- std r7, 16(rp)
- addi rp, rp, 16
- ld r9, 24(up)
- ld r27, 32(up)
- addi up, up, 40
- bdz L(em1)
-
- ALIGN(16)
-L(tm1): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r9, 0(up)
- ld r27, 8(up)
- adde r0, r0, r12
- adde r7, r7, r26
- mulld r26, r9, r6
- mulhdu r10, r9, r6
- mulld r11, r27, r6
- mulhdu r12, r27, r6
- ld r9, 16(up)
- ld r27, 24(up)
- std r0, 8(rp)
- adde r26, r26, r8
- std r7, 16(rp)
- adde r11, r11, r10
- std r26, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- bdnz L(tm1)
-
-L(em1): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- adde r0, r0, r12
- adde r7, r7, r26
- std r0, 8(rp)
- std r7, 16(rp)
- addze r8, r8
- std r8, 24(rp)
- addi n, n, 2
- b L(outer_loop_ent_3)
-
-L(b2): addi r7, r7, -1 C FIXME
- mtctr r7 C FIXME
- ld r6, 0(up)
- ld r9, 8(up)
- ld r27, 16(up)
- mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r9, 24(up)
- mulld r11, r9, r6
- mulhdu r10, r9, r6
- addc r7, r7, r26
- adde r11, r11, r8
- addze r12, r10
- std r0, 8(rp)
- std r7, 16(rp)
- std r11, 24(rp)
- addi rp, rp, 24
- ld r9, 32(up)
- ld r27, 40(up)
- addi up, up, 48
- bdz L(em2)
-
- ALIGN(16)
-L(tm2): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r9, 0(up)
- ld r27, 8(up)
- adde r0, r0, r12
- adde r7, r7, r26
- mulld r26, r9, r6
- mulhdu r10, r9, r6
- mulld r11, r27, r6
- mulhdu r12, r27, r6
- ld r9, 16(up)
- ld r27, 24(up)
- std r0, 8(rp)
- adde r26, r26, r8
- std r7, 16(rp)
- adde r11, r11, r10
- std r26, 24(rp)
- addi up, up, 32
- std r11, 32(rp)
- addi rp, rp, 32
- bdnz L(tm2)
-
-L(em2): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- adde r0, r0, r12
- adde r7, r7, r26
- std r0, 8(rp)
- std r7, 16(rp)
- addze r8, r8
- std r8, 24(rp)
- addi n, n, 2
- b L(outer_loop_ent_0)
-
-
-L(outer_loop):
- addi n, n, -1
- addi up_outer, up_outer, 8
- addi rp_outer, rp_outer, 16
-
- mr up, up_outer
- addi rp, rp_outer, 8
-
- srdi r0, n, 2
- mtctr r0
-
- bdz L(outer_end)
-
- ld r6, 0(up)
- ld r9, 8(up)
- ld r27, 16(up)
- mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r9, 24(up)
- ld r28, 0(rp)
- ld r29, 8(rp)
- ld r30, 16(rp)
- mulld r11, r9, r6
- mulhdu r10, r9, r6
- addc r7, r7, r26
- adde r11, r11, r8
- addze r12, r10
- addc r0, r0, r28
- std r0, 0(rp)
- adde r7, r7, r29
- std r7, 8(rp)
- adde r11, r11, r30
- std r11, 16(rp)
- addi rp, rp, 24
- ld r9, 32(up)
- ld r27, 40(up)
- addi up, up, 48
- bdz L(ea1)
-
- ALIGN(16)
-L(ta1): mulld r0, r9, r6
- mulhdu r26, r9, r6 C 9
- mulld r7, r27, r6
- mulhdu r8, r27, r6 C 27
- ld r9, 0(up)
- ld r28, 0(rp)
- ld r27, 8(up)
- ld r29, 8(rp)
- adde r0, r0, r12 C 0 12
- adde r7, r7, r26 C 5 7
- mulld r26, r9, r6
- mulhdu r10, r9, r6 C 9
- mulld r11, r27, r6
- mulhdu r12, r27, r6 C 27
- ld r9, 16(up)
- ld r30, 16(rp)
- ld r27, 24(up)
- ld r31, 24(rp)
- adde r26, r26, r8 C 8 5
- adde r11, r11, r10 C 10 11
- addze r12, r12 C 12
- addc r0, r0, r28 C 0 28
- std r0, 0(rp) C 0
- adde r7, r7, r29 C 7 29
- std r7, 8(rp) C 7
- adde r26, r26, r30 C 5 30
- std r26, 16(rp) C 5
- adde r11, r11, r31 C 11 31
- std r11, 24(rp) C 11
- addi up, up, 32
- addi rp, rp, 32
- bdnz L(ta1)
-
-L(ea1): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r28, 0(rp)
- ld r29, 8(rp)
- adde r0, r0, r12
- adde r7, r7, r26
- addze r8, r8
- addc r0, r0, r28
- std r0, 0(rp)
- adde r7, r7, r29
- std r7, 8(rp)
- addze r8, r8
- std r8, 16(rp)
-
-L(outer_loop_ent_0):
- addi n, n, -1
- addi up_outer, up_outer, 8
- addi rp_outer, rp_outer, 16
-
- mr up, up_outer
- addi rp, rp_outer, 8
-
- srdi r0, n, 2
- mtctr r0
-
- ld r6, 0(up)
- ld r9, 8(up)
- ld r27, 16(up)
- ld r28, 0(rp)
- ld r29, 8(rp)
- mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- addc r0, r0, r28
- adde r7, r7, r26
- addze r12, r8
- std r0, 0(rp)
- adde r7, r7, r29
- std r7, 8(rp)
- addi rp, rp, 16
- ld r9, 24(up)
- ld r27, 32(up)
- addi up, up, 40
- bdz L(ea0)
-
- ALIGN(16)
-L(ta0): mulld r0, r9, r6
- mulhdu r26, r9, r6 C 9
- mulld r7, r27, r6
- mulhdu r8, r27, r6 C 27
- ld r9, 0(up)
- ld r28, 0(rp)
- ld r27, 8(up)
- ld r29, 8(rp)
- adde r0, r0, r12 C 0 12
- adde r7, r7, r26 C 5 7
- mulld r26, r9, r6
- mulhdu r10, r9, r6 C 9
- mulld r11, r27, r6
- mulhdu r12, r27, r6 C 27
- ld r9, 16(up)
- ld r30, 16(rp)
- ld r27, 24(up)
- ld r31, 24(rp)
- adde r26, r26, r8 C 8 5
- adde r11, r11, r10 C 10 11
- addze r12, r12 C 12
- addc r0, r0, r28 C 0 28
- std r0, 0(rp) C 0
- adde r7, r7, r29 C 7 29
- std r7, 8(rp) C 7
- adde r26, r26, r30 C 5 30
- std r26, 16(rp) C 5
- adde r11, r11, r31 C 11 31
- std r11, 24(rp) C 11
- addi up, up, 32
- addi rp, rp, 32
- bdnz L(ta0)
-
-L(ea0): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r28, 0(rp)
- ld r29, 8(rp)
- adde r0, r0, r12
- adde r7, r7, r26
- addze r8, r8
- addc r0, r0, r28
- std r0, 0(rp)
- adde r7, r7, r29
- std r7, 8(rp)
- addze r8, r8
- std r8, 16(rp)
-
-L(outer_loop_ent_3):
- addi n, n, -1
- addi up_outer, up_outer, 8
- addi rp_outer, rp_outer, 16
-
- mr up, up_outer
- addi rp, rp_outer, 8
-
- srdi r0, n, 2
- mtctr r0
-
- ld r6, 0(up)
- ld r9, 8(up)
- ld r28, 0(rp)
- mulld r0, r9, r6
- mulhdu r12, r9, r6
- addc r0, r0, r28
- std r0, 0(rp)
- addi rp, rp, 8
- ld r9, 16(up)
- ld r27, 24(up)
- addi up, up, 32
- bdz L(ea3)
-
- ALIGN(16)
-L(ta3): mulld r0, r9, r6
- mulhdu r26, r9, r6 C 9
- mulld r7, r27, r6
- mulhdu r8, r27, r6 C 27
- ld r9, 0(up)
- ld r28, 0(rp)
- ld r27, 8(up)
- ld r29, 8(rp)
- adde r0, r0, r12 C 0 12
- adde r7, r7, r26 C 5 7
- mulld r26, r9, r6
- mulhdu r10, r9, r6 C 9
- mulld r11, r27, r6
- mulhdu r12, r27, r6 C 27
- ld r9, 16(up)
- ld r30, 16(rp)
- ld r27, 24(up)
- ld r31, 24(rp)
- adde r26, r26, r8 C 8 5
- adde r11, r11, r10 C 10 11
- addze r12, r12 C 12
- addc r0, r0, r28 C 0 28
- std r0, 0(rp) C 0
- adde r7, r7, r29 C 7 29
- std r7, 8(rp) C 7
- adde r26, r26, r30 C 5 30
- std r26, 16(rp) C 5
- adde r11, r11, r31 C 11 31
- std r11, 24(rp) C 11
- addi up, up, 32
- addi rp, rp, 32
- bdnz L(ta3)
-
-L(ea3): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r28, 0(rp)
- ld r29, 8(rp)
- adde r0, r0, r12
- adde r7, r7, r26
- addze r8, r8
- addc r0, r0, r28
- std r0, 0(rp)
- adde r7, r7, r29
- std r7, 8(rp)
- addze r8, r8
- std r8, 16(rp)
-
-
-L(outer_loop_ent_2):
- addi n, n, -1
- addi up_outer, up_outer, 8
- addi rp_outer, rp_outer, 16
-
- mr up, up_outer
- addi rp, rp_outer, 8
-
- srdi r0, n, 2
- mtctr r0
-
- addic r0, r0, 0
- li r12, 0 C cy_limb = 0
- ld r6, 0(up)
- ld r9, 8(up)
- ld r27, 16(up)
- bdz L(ea2)
- addi up, up, 24
-
- ALIGN(16)
-L(ta2): mulld r0, r9, r6
- mulhdu r26, r9, r6 C 9
- mulld r7, r27, r6
- mulhdu r8, r27, r6 C 27
- ld r9, 0(up)
- ld r28, 0(rp)
- ld r27, 8(up)
- ld r29, 8(rp)
- adde r0, r0, r12 C 0 12
- adde r7, r7, r26 C 5 7
- mulld r26, r9, r6
- mulhdu r10, r9, r6 C 9
- mulld r11, r27, r6
- mulhdu r12, r27, r6 C 27
- ld r9, 16(up)
- ld r30, 16(rp)
- ld r27, 24(up)
- ld r31, 24(rp)
- adde r26, r26, r8 C 8 5
- adde r11, r11, r10 C 10 11
- addze r12, r12 C 12
- addc r0, r0, r28 C 0 28
- std r0, 0(rp) C 0
- adde r7, r7, r29 C 7 29
- std r7, 8(rp) C 7
- adde r26, r26, r30 C 5 30
- std r26, 16(rp) C 5
- adde r11, r11, r31 C 11 31
- std r11, 24(rp) C 11
- addi up, up, 32
- addi rp, rp, 32
- bdnz L(ta2)
-
-L(ea2): mulld r0, r9, r6
- mulhdu r26, r9, r6
- mulld r7, r27, r6
- mulhdu r8, r27, r6
- ld r28, 0(rp)
- ld r29, 8(rp)
- adde r0, r0, r12
- adde r7, r7, r26
- addze r8, r8
- addc r0, r0, r28
- std r0, 0(rp)
- adde r7, r7, r29
- std r7, 8(rp)
- addze r8, r8
- std r8, 16(rp)
-
- b L(outer_loop)
-
-L(outer_end):
- ld r6, 0(up)
- ld r9, 8(up)
- ld r11, 0(rp)
- mulld r0, r9, r6
- mulhdu r8, r9, r6
- addc r0, r0, r11
- std r0, 0(rp)
- addze r8, r8
- std r8, 8(rp)
-
-define(`rp', `rp_saved')
-define(`up', `r5')
-define(`n', `r6')
-define(`climb', `r0')
-
- addi r4, rp_saved, 8
- mr r5, up_saved
- mr r6, n_saved
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2
- addi n, n, 2 C compute count...
- srdi n, n, 2 C ...for ctr
- mtctr n C put loop count into ctr
- beq cr0, L(xb0)
- blt cr6, L(xb1)
- beq cr6, L(xb2)
-
-L(xb3): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- addi up, up, 24
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- ld r10, 8(rp)
- ld r11, 16(rp)
- ld r6, 24(rp)
- ld r7, 32(rp)
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- addze climb, r29
- addc r10, r10, r25
- adde r11, r11, r26
- adde r6, r6, r27
- adde r7, r7, r28
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- std r6, 24(rp)
- std r7, 32(rp)
- addi rp, rp, 40
- bdnz L(top)
- b L(end)
-
-L(xb2): ld r6, 0(up)
- ld r7, 8(up)
- addi up, up, 16
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- ld r10, 8(rp)
- ld r11, 16(rp)
- addc r10, r10, r10
- adde r11, r11, r11
- addze climb, r27
- addc r10, r10, r25
- adde r11, r11, r26
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- addi rp, rp, 24
- bdnz L(top)
- b L(end)
-
-L(xb0): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r10, 8(rp)
- ld r11, 16(rp)
- ld r6, 24(rp)
- ld r7, 32(rp)
- ld r12, 40(rp)
- ld r23, 48(rp)
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- adde r12, r12, r12
- adde r23, r23, r23
- addze climb, r31
- std r24, 0(rp)
- addc r10, r10, r25
- std r10, 8(rp)
- adde r11, r11, r26
- std r11, 16(rp)
- adde r6, r6, r27
- std r6, 24(rp)
- adde r7, r7, r28
- std r7, 32(rp)
- adde r12, r12, r29
- std r12, 40(rp)
- adde r23, r23, r30
- std r23, 48(rp)
- addi rp, rp, 56
- bdnz L(top)
- b L(end)
-
-L(xb1): ld r6, 0(up)
- addi up, up, 8
- mulld r24, r6, r6
- mulhdu climb, r6, r6
- std r24, 0(rp)
- addic rp, rp, 8 C clear carry as side-effect
-
- ALIGN(32)
-L(top): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r8, 0(rp)
- ld r9, 8(rp)
- adde r8, r8, r8
- adde r9, r9, r9
- ld r10, 16(rp)
- ld r11, 24(rp)
- adde r10, r10, r10
- adde r11, r11, r11
- ld r6, 32(rp)
- ld r7, 40(rp)
- adde r6, r6, r6
- adde r7, r7, r7
- ld r12, 48(rp)
- ld r23, 56(rp)
- adde r12, r12, r12
- adde r23, r23, r23
- addze r31, r31
- addc r8, r8, climb
- std r8, 0(rp)
- adde r9, r9, r24
- std r9, 8(rp)
- adde r10, r10, r25
- std r10, 16(rp)
- adde r11, r11, r26
- std r11, 24(rp)
- adde r6, r6, r27
- std r6, 32(rp)
- adde r7, r7, r28
- std r7, 40(rp)
- adde r12, r12, r29
- std r12, 48(rp)
- adde r23, r23, r30
- std r23, 56(rp)
- mr climb, r31
- addi rp, rp, 64
- bdnz L(top)
-
-L(end): addze climb, climb
- std climb, 0(rp)
-
- ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- ld r26, -48(r1)
- ld r25, -56(r1)
- ld r24, -64(r1)
- ld r23, -72(r1)
- ld r22, -80(r1)
- ld r21, -88(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/sublsh1_n.asm b/gmp/mpn/powerpc64/mode64/sublsh1_n.asm
new file mode 100644
index 0000000000..69e0dfa5a2
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/sublsh1_n.asm
@@ -0,0 +1,83 @@
+dnl PowerPC-64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 2 (1.5 c/l should be possible)
+C POWER4/PPC970: 4 (2.0 c/l should be possible)
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+define(`rp',`r3')
+define(`up',`r4')
+define(`vp',`r5')
+
+define(`s0',`r6')
+define(`s1',`r7')
+define(`u0',`r8')
+define(`v0',`r10')
+define(`v1',`r11')
+
+ASM_START()
+PROLOGUE(mpn_sublsh1_n)
+ mtctr r6 C put n in ctr
+
+ ld v0, 0(vp) C load v limb
+ ld u0, 0(up) C load u limb
+ addic up, up, -8 C update up; set cy
+ addi rp, rp, -8 C update rp
+ sldi s1, v0, 1 C left shift first v limb
+ bdz L(end) C If done, skip loop
+
+L(oop): ld v1, 8(vp) C load v limb
+ subfe s1, s1, u0 C subtract limbs with cy, set cy
+ std s1, 8(rp) C store result limb
+ srdi s0, v0, 63 C shift down previous v limb
+ ldu u0, 16(up) C load u limb and update up
+ rldimi s0, v1, 1, 0 C left shift v limb and merge with prev v limb
+
+ bdz L(exit) C decrement ctr and exit if done
+
+ ldu v0, 16(vp) C load v limb and update vp
+ subfe s0, s0, u0 C subtract limbs with cy, set cy
+ stdu s0, 16(rp) C store result limb and update rp
+ srdi s1, v1, 63 C shift down previous v limb
+ ld u0, 8(up) C load u limb
+ rldimi s1, v0, 1, 0 C left shift v limb and merge with prev v limb
+
+ bdnz L(oop) C decrement ctr and loop back
+
+L(end): subfe r7, s1, u0 C subtract last limbs with cy, set cy
+ std r7, 8(rp) C store last result limb
+ srdi r3, v0, 63 C bit shifted out of the last v limb
+ subfze r3, r3 C r3 = ~r3 + cy
+ neg r3, r3 C return shifted out bit + borrow
+ blr
+L(exit): subfe r7, s0, u0 C subtract last limbs with cy, set cy
+ std r7, 16(rp) C store last result limb
+ srdi r3, v1, 63 C bit shifted out of the last v limb
+ subfze r3, r3 C r3 = ~r3 + cy
+ neg r3, r3 C return shifted out bit + borrow
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/mode64/submul_1.asm b/gmp/mpn/powerpc64/mode64/submul_1.asm
new file mode 100644
index 0000000000..3c1e8a5c82
--- /dev/null
+++ b/gmp/mpn/powerpc64/mode64/submul_1.asm
@@ -0,0 +1,62 @@
+dnl PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 6-18
+C POWER4/PPC970: 10
+C POWER5: 10.5
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vl', `r6')
+define(`cy', `r7')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ li cy, 0 C cy_limb = 0
+
+PROLOGUE(mpn_submul_1c)
+ mtctr n C put n in ctr
+ addic r0, r0, 0 C adding 0 clears the carry bit
+ addi rp, rp, -8 C bias rp for the 8(rp) accesses below
+ ALIGN(16)
+L(top):
+ ld r0, 0(up) C load u limb
+ ld r10, 8(rp) C load r limb
+ mulld r9, r0, vl C low half of product
+ mulhdu r5, r0, vl C high half of product
+ adde r9, r9, cy C add carry limb and carry bit to low half
+ addi up, up, 8 C update up
+ addze cy, r5 C new carry limb = high half + carry bit
+ subf r12, r9, r10 C result limb = r limb - low half
+ not r0, r10 C complement r limb for the borrow test
+ addc r11, r9, r0 C inverted carry from subf
+ stdu r12, 8(rp) C store result limb and update rp
+ bdnz L(top) C decrement ctr and loop back
+
+ addze r3, cy C return carry limb + final carry bit
+ blr
+EPILOGUE(mpn_submul_1)
+EPILOGUE(mpn_submul_1c)
diff --git a/gmp/mpn/powerpc64/p6/lshift.asm b/gmp/mpn/powerpc64/p6/lshift.asm
deleted file mode 100644
index 1a200fb346..0000000000
--- a/gmp/mpn/powerpc64/p6/lshift.asm
+++ /dev/null
@@ -1,132 +0,0 @@
-dnl PowerPC-64 mpn_lshift -- rp[] = up[] << cnt
-
-dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 4
-
-C TODO
-C * Micro-optimise header code
-C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4236
-C bytes, 4-way code would become about 50% larger.
-
-C INPUT PARAMETERS
-define(`rp_param', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`cnt', `r6')
-
-define(`tnc',`r0')
-define(`retval',`r3')
-define(`rp', `r7')
-
-ASM_START()
-PROLOGUE(mpn_lshift,toc)
-
-ifdef(`HAVE_ABI_mode32',`
- rldicl n, n, 0,32 C FIXME: avoid this zero extend
-')
- mflr r12
- sldi r8, n, 3
- sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block
- LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1)
- add up, up, r8 C make up point at end of up[]
- add r11, r11, r10 C address of L(oN) for N = cnt
- srdi r10, n, 1
- add rp, rp_param, r8 C make rp point at end of rp[]
- subfic tnc, cnt, 64
- rlwinm. r8, n, 0,31,31 C extract bit 0
- mtctr r10
- beq L(evn)
-
-L(odd): ld r9, -8(up)
- cmpdi cr0, n, 1 C n = 1?
- beq L(1)
- ld r8, -16(up)
- addi r11, r11, -84 C L(o1) - L(e1) - 64
- mtlr r11
- srd r3, r9, tnc C retval
- addi up, up, 8
- addi rp, rp, -8
- blr C branch to L(oN)
-
-L(evn): ld r8, -8(up)
- ld r9, -16(up)
- addi r11, r11, -64
- mtlr r11
- srd r3, r8, tnc C retval
- blr C branch to L(eN)
-
-L(1): srd r3, r9, tnc C retval
- sld r8, r9, cnt
- std r8, -8(rp)
- mtlr r12
-ifdef(`HAVE_ABI_mode32',
-` mr r4, r3
- srdi r3, r3, 32
-')
- blr
-
-
-define(SHIFT,`
-L(lo$1):ld r8, -24(up)
- std r11, -8(rp)
- addi rp, rp, -16
-L(o$1): srdi r10, r8, eval(64-$1)
- rldimi r10, r9, $1, 0
- ld r9, -32(up)
- addi up, up, -16
- std r10, 0(rp)
-L(e$1): srdi r11, r9, eval(64-$1)
- rldimi r11, r8, $1, 0
- bdnz L(lo$1)
- std r11, -8(rp)
- sldi r10, r9, $1
- b L(com)
- nop
- nop
-')
-
- ALIGN(64)
-forloop(`i',1,63,`SHIFT(i)')
-
-L(com): std r10, -16(rp)
- mtlr r12
-ifdef(`HAVE_ABI_mode32',
-` mr r4, r3
- srdi r3, r3, 32
-')
- blr
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/powerpc64/p6/lshiftc.asm b/gmp/mpn/powerpc64/p6/lshiftc.asm
deleted file mode 100644
index e4b3caaab8..0000000000
--- a/gmp/mpn/powerpc64/p6/lshiftc.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
-
-dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 4
-
-C TODO
-C * Micro-optimise header code
-C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4236
-C bytes, 4-way code would become about 50% larger.
-
-C INPUT PARAMETERS
-define(`rp_param', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`cnt', `r6')
-
-define(`tnc',`r0')
-define(`retval',`r3')
-define(`rp', `r7')
-
-ASM_START()
-PROLOGUE(mpn_lshiftc,toc)
-
-ifdef(`HAVE_ABI_mode32',`
- rldicl n, n, 0,32 C FIXME: avoid this zero extend
-')
- mflr r12
- sldi r8, n, 3
- sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block
- LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1)
- add up, up, r8 C make up point at end of up[]
- add r11, r11, r10 C address of L(oN) for N = cnt
- srdi r10, n, 1
- add rp, rp_param, r8 C make rp point at end of rp[]
- subfic tnc, cnt, 64
- rlwinm. r8, n, 0,31,31 C extract bit 0
- mtctr r10
- beq L(evn)
-
-L(odd): ld r9, -8(up)
- cmpdi cr0, n, 1 C n = 1?
- beq L(1)
- ld r8, -16(up)
- addi r11, r11, -88 C L(o1) - L(e1) - 64
- mtlr r11
- srd r3, r9, tnc C retval
- addi up, up, 8
- addi rp, rp, -8
- blr C branch to L(oN)
-
-L(evn): ld r8, -8(up)
- ld r9, -16(up)
- addi r11, r11, -64
- mtlr r11
- srd r3, r8, tnc C retval
- blr C branch to L(eN)
-
-L(1): srd r3, r9, tnc C retval
- sld r8, r9, cnt
- nor r8, r8, r8
- std r8, -8(rp)
- mtlr r12
-ifdef(`HAVE_ABI_mode32',
-` mr r4, r3
- srdi r3, r3, 32
-')
- blr
-
-
-define(SHIFT,`
-L(lo$1):ld r8, -24(up)
- nor r11, r11, r11
- std r11, -8(rp)
- addi rp, rp, -16
-L(o$1): srdi r10, r8, eval(64-$1)
- rldimi r10, r9, $1, 0
- ld r9, -32(up)
- addi up, up, -16
- nor r10, r10, r10
- std r10, 0(rp)
-L(e$1): srdi r11, r9, eval(64-$1)
- rldimi r11, r8, $1, 0
- bdnz L(lo$1)
- sldi r10, r9, $1
- b L(com)
- nop
-')
-
- ALIGN(64)
-forloop(`i',1,63,`SHIFT(i)')
-
-L(com): nor r11, r11, r11
- nor r10, r10, r10
- std r11, -8(rp)
- std r10, -16(rp)
- mtlr r12
-ifdef(`HAVE_ABI_mode32',
-` mr r4, r3
- srdi r3, r3, 32
-')
- blr
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/powerpc64/p6/rshift.asm b/gmp/mpn/powerpc64/p6/rshift.asm
deleted file mode 100644
index 9e848c1fc7..0000000000
--- a/gmp/mpn/powerpc64/p6/rshift.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-dnl PowerPC-64 mpn_rshift -- rp[] = up[] << cnt
-
-dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2
-C POWER6 3.5 (mysteriously 3.0 for cnt=1)
-
-C TODO
-C * Micro-optimise header code
-C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4248
-C bytes, 4-way code would become about 50% larger.
-
-C INPUT PARAMETERS
-define(`rp_param', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`cnt', `r6')
-
-define(`tnc',`r0')
-define(`retval',`r3')
-define(`rp', `r7')
-
-ASM_START()
-PROLOGUE(mpn_rshift,toc)
-
-ifdef(`HAVE_ABI_mode32',`
- rldicl n, n, 0,32 C FIXME: avoid this zero extend
-')
- mflr r12
- LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1)
- sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block
- add r11, r11, r10 C address of L(oN) for N = cnt
- srdi r10, n, 1
- mr rp, rp_param
- subfic tnc, cnt, 64
- rlwinm. r8, n, 0,31,31 C extract bit 0
- mtctr r10
- beq L(evn)
-
-L(odd): ld r9, 0(up)
- cmpdi cr0, n, 1 C n = 1?
- beq L(1)
- ld r8, 8(up)
- addi r11, r11, -84 C L(o1) - L(e1) - 64
- mtlr r11
- sld r3, r9, tnc C retval
- addi up, up, 8
- addi rp, rp, 8
- blr C branch to L(oN)
-
-L(evn): ld r8, 0(up)
- ld r9, 8(up)
- addi r11, r11, -64
- mtlr r11
- sld r3, r8, tnc C retval
- addi up, up, 16
- blr C branch to L(eN)
-
-L(1): sld r3, r9, tnc C retval
- srd r8, r9, cnt
- std r8, 0(rp)
- mtlr r12
-ifdef(`HAVE_ABI_mode32',
-` mr r4, r3
- srdi r3, r3, 32
-')
- blr
-
-
-define(SHIFT,`
-L(lo$1):ld r8, 0(up)
- std r11, 0(rp)
- addi rp, rp, 16
-L(o$1): srdi r10, r9, $1
- rldimi r10, r8, eval(64-$1), 0
- ld r9, 8(up)
- addi up, up, 16
- std r10, -8(rp)
-L(e$1): srdi r11, r8, $1
- rldimi r11, r9, eval(64-$1), 0
- bdnz L(lo$1)
- std r11, 0(rp)
- srdi r10, r9, $1
- b L(com)
- nop
- nop
-')
-
- ALIGN(64)
-forloop(`i',1,63,`SHIFT(i)')
-
-L(com): std r10, 8(rp)
- mtlr r12
-ifdef(`HAVE_ABI_mode32',
-` mr r4, r3
- srdi r3, r3, 32
-')
- blr
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/powerpc64/p7/copyd.asm b/gmp/mpn/powerpc64/p7/copyd.asm
deleted file mode 100644
index f04ca586e8..0000000000
--- a/gmp/mpn/powerpc64/p7/copyd.asm
+++ /dev/null
@@ -1,128 +0,0 @@
-dnl PowerPC-64 mpn_copyd.
-
-dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 ?
-C POWER6 1.25
-C POWER7 1.09
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-
-ASM_START()
-PROLOGUE(mpn_copyd)
-
-ifdef(`HAVE_ABI_mode32',
-` rldicl n, n, 0,32')
-
- sldi r0, n, 3
- add up, up, r0 C point at u[] end
- add rp, rp, r0 C point at r[] end
-
- cmpdi cr0, n, 4
- blt L(sml)
-
- addi r10, n, 4
- srdi r10, r10, 3
- mtctr r10
-
- andi. r0, n, 1
- rlwinm r11, n, 0,30,30
- rlwinm r12, n, 0,29,29
- cmpdi cr6, r11, 0
- cmpdi cr7, r12, 0
-
- beq cr0, L(xx0)
-L(xx1): ld r6, -8(up)
- addi up, up, -8
- std r6, -8(rp)
- addi rp, rp, -8
-
-L(xx0): bne cr6, L(x10)
-L(x00): ld r6, -8(up)
- ld r7, -16(up)
- bne cr7, L(100)
-L(000): addi rp, rp, 32
- b L(lo0)
-L(100): addi up, up, 32
- b L(lo4)
-L(x10): ld r8, -8(up)
- ld r9, -16(up)
- bne cr7, L(110)
-L(010): addi up, up, -16
- addi rp, rp, 16
- b L(lo2)
-L(110): addi up, up, 16
- addi rp, rp, 48
- b L(lo6)
-
-L(sml): cmpdi cr0, n, 0
- beqlr- cr0
- mtctr n
-L(t): ld r6, -8(up)
- addi up, up, -8
- std r6, -8(rp)
- addi rp, rp, -8
- bdnz L(t)
- blr
-
- ALIGN(32)
-L(top): std r6, -8(rp)
- std r7, -16(rp)
-L(lo2): ld r6, -8(up)
- ld r7, -16(up)
- std r8, -24(rp)
- std r9, -32(rp)
-L(lo0): ld r8, -24(up)
- ld r9, -32(up)
- std r6, -40(rp)
- std r7, -48(rp)
-L(lo6): ld r6, -40(up)
- ld r7, -48(up)
- std r8, -56(rp)
- std r9, -64(rp)
- addi rp, rp, -64
-L(lo4): ld r8, -56(up)
- ld r9, -64(up)
- addi up, up, -64
- bdnz L(top)
-
-L(end): std r6, -8(rp)
- std r7, -16(rp)
- std r8, -24(rp)
- std r9, -32(rp)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/p7/copyi.asm b/gmp/mpn/powerpc64/p7/copyi.asm
deleted file mode 100644
index 854cf9f809..0000000000
--- a/gmp/mpn/powerpc64/p7/copyi.asm
+++ /dev/null
@@ -1,129 +0,0 @@
-dnl PowerPC-64 mpn_copyi.
-
-dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 ?
-C POWER6 1.25
-C POWER7 1.09
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-
-C TODO
-C * Try rolling the two loop leading std to the end, allowing the code to
-C handle also n = 2.
-C * Consider using 4 pointers, schedule ptr update early wrt use.
-
-ASM_START()
-PROLOGUE(mpn_copyi)
-
-ifdef(`HAVE_ABI_mode32',
-` rldicl n, n, 0,32')
-
- cmpdi cr0, n, 4
- blt L(sml)
-
- addi r10, n, 4
- srdi r10, r10, 3
- mtctr r10
-
- andi. r0, n, 1
- rlwinm r11, n, 0,30,30
- rlwinm r12, n, 0,29,29
- cmpdi cr6, r11, 0
- cmpdi cr7, r12, 0
-
- beq cr0, L(xx0)
-L(xx1): ld r6, 0(up)
- addi up, up, 8
- std r6, 0(rp)
- addi rp, rp, 8
-
-L(xx0): bne cr6, L(x10)
-L(x00): ld r6, 0(up)
- ld r7, 8(up)
- bne cr7, L(100)
-L(000): addi rp, rp, -32
- b L(lo0)
-L(100): addi up, up, -32
- b L(lo4)
-L(x10): ld r8, 0(up)
- ld r9, 8(up)
- bne cr7, L(110)
-L(010): addi up, up, 16
- addi rp, rp, -16
- b L(lo2)
-L(110): addi up, up, -16
- addi rp, rp, -48
- b L(lo6)
-
-L(sml): cmpdi cr0, n, 0
- beqlr- cr0
- mtctr n
-L(t): ld r6, 0(up)
- addi up, up, 8
- std r6, 0(rp)
- addi rp, rp, 8
- bdnz L(t)
- blr
-
- ALIGN(32)
-L(top): std r6, 0(rp)
- std r7, 8(rp)
-L(lo2): ld r6, 0(up)
- ld r7, 8(up)
- std r8, 16(rp)
- std r9, 24(rp)
-L(lo0): ld r8, 16(up)
- ld r9, 24(up)
- std r6, 32(rp)
- std r7, 40(rp)
-L(lo6): ld r6, 32(up)
- ld r7, 40(up)
- std r8, 48(rp)
- std r9, 56(rp)
- addi rp, rp, 64
-L(lo4): ld r8, 48(up)
- ld r9, 56(up)
- addi up, up, 64
- bdnz L(top)
-
-L(end): std r6, 0(rp)
- std r7, 8(rp)
- std r8, 16(rp)
- std r9, 24(rp)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/p7/hamdist.asm b/gmp/mpn/powerpc64/p7/hamdist.asm
deleted file mode 100644
index 5af98946f7..0000000000
--- a/gmp/mpn/powerpc64/p7/hamdist.asm
+++ /dev/null
@@ -1,110 +0,0 @@
-dnl PowerPC-64 mpn_hamdist.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 -
-C POWER4/PPC970 -
-C POWER5 -
-C POWER6 -
-C POWER7 2.87
-
-define(`up', r3)
-define(`vp', r4)
-define(`n', r5)
-
-ASM_START()
-PROLOGUE(mpn_hamdist)
- std r30, -16(r1)
- std r31, -8(r1)
-
- addi r0, n, 1
-ifdef(`HAVE_ABI_mode32',
-` rldicl r0, r0, 63,33', C ...branch count
-` srdi r0, r0, 1') C ...for ctr
- mtctr r0
-
- andi. r0, n, 1
-
- li r0, 0
- li r12, 0
-
- beq L(evn)
-
-L(odd): ld r6, 0(up)
- addi up, up, 8
- ld r8, 0(vp)
- addi vp, vp, 8
- xor r10, r6, r8
- popcntd r0, r10
- bdz L(e1)
-
-L(evn): ld r6, 0(up)
- ld r8, 0(vp)
- ld r7, 8(up)
- ld r9, 8(vp)
- xor r10, r6, r8
- addi up, up, 16
- addi vp, vp, 16
- li r30, 0
- li r31, 0
- bdz L(end)
-
- nop
- nop
-C ALIGN(16)
-L(top): add r0, r0, r30
- ld r6, 0(up)
- ld r8, 0(vp)
- xor r11, r7, r9
- popcntd r30, r10
- add r12, r12, r31
- ld r7, 8(up)
- ld r9, 8(vp)
- xor r10, r6, r8
- popcntd r31, r11
- addi up, up, 16
- addi vp, vp, 16
- bdnz L(top)
-
-L(end): add r0, r0, r30
- xor r11, r7, r9
- popcntd r30, r10
- add r12, r12, r31
- popcntd r31, r11
-
- add r0, r0, r30
- add r12, r12, r31
-L(e1): add r3, r0, r12
- ld r30, -16(r1)
- ld r31, -8(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/p7/popcount.asm b/gmp/mpn/powerpc64/p7/popcount.asm
deleted file mode 100644
index eac72a6493..0000000000
--- a/gmp/mpn/powerpc64/p7/popcount.asm
+++ /dev/null
@@ -1,90 +0,0 @@
-dnl PowerPC-64 mpn_popcount.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 -
-C POWER4/PPC970 -
-C POWER5 -
-C POWER6 -
-C POWER7 2
-
-define(`up', r3)
-define(`n', r4)
-
-ASM_START()
-PROLOGUE(mpn_popcount)
- addi r0, n, 1
-ifdef(`HAVE_ABI_mode32',
-` rldicl r0, r0, 63,33', C ...branch count
-` srdi r0, r0, 1') C ...for ctr
- mtctr r0
-
- andi. r0, n, 1
-
- li r0, 0
- li r12, 0
- beq L(evn)
-
-L(odd): ld r4, 0(up)
- addi up, up, 8
- popcntd r0, r4
- bdz L(e1)
-
-L(evn): ld r4, 0(up)
- ld r5, 8(up)
- popcntd r8, r4
- popcntd r9, r5
- bdz L(e2)
-
- ld r4, 16(up)
- ld r5, 24(up)
- bdz L(e4)
- addi up, up, 32
-
-L(top): add r0, r0, r8
- popcntd r8, r4
- ld r4, 0(up)
- add r12, r12, r9
- popcntd r9, r5
- ld r5, 8(up)
- addi up, up, 16
- bdnz L(top)
-
-L(e4): add r0, r0, r8
- popcntd r8, r4
- add r12, r12, r9
- popcntd r9, r5
-L(e2): add r0, r0, r8
- add r12, r12, r9
-L(e1): add r3, r0, r12
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/rshift.asm b/gmp/mpn/powerpc64/rshift.asm
index 7654a16ae8..e73640d08c 100644
--- a/gmp/mpn/powerpc64/rshift.asm
+++ b/gmp/mpn/powerpc64/rshift.asm
@@ -1,207 +1,107 @@
dnl PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt
-dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
-include(`../config.m4')
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
-C POWER7 2.15
+include(`../config.m4')
-C TODO
-C * Try to reduce the number of needed live registers
-C * Micro-optimise header code
-C * Keep in synch with lshift.asm and lshiftc.asm
+C cycles/limb
+C POWER3/PPC630: 1.5
+C POWER4/PPC970: 3.0
C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`cnt', `r6')
+define(`rp',`r3')
+define(`up',`r4')
+define(`n',`r5')
+define(`cnt',`r6')
+
+define(`tnc',`r5')
+define(`v0',`r0')
+define(`v1',`r7')
+define(`u0',`r8')
+define(`u1',`r9')
+define(`h0',`r10')
+define(`h1',`r11')
-define(`tnc',`r0')
-define(`u0',`r30')
-define(`u1',`r31')
-define(`retval',`r5')
ASM_START()
PROLOGUE(mpn_rshift)
- std r31, -8(r1)
- std r30, -16(r1)
- subfic tnc, cnt, 64
-C sldi r30, n, 3 C byte count corresponding to n
-C add rp, rp, r30 C rp = rp + n
-C add up, up, r30 C up = up + n
- rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
- cmpdi cr6, r30, 2
- addi r31, n, 3 C compute count...
- ld r10, 0(up) C load 1st limb for b00...b11
- sld retval, r10, tnc
ifdef(`HAVE_ABI_mode32',
-` rldicl r31, r31, 62,34', C ...branch count
-` srdi r31, r31, 2') C ...for ctr
- mtctr r31 C copy count into ctr
- beq cr0, L(b00)
- blt cr6, L(b01)
- ld r11, 8(up) C load 2nd limb for b10 and b11
- beq cr6, L(b10)
-
- ALIGN(16)
-L(b11): srd r8, r10, cnt
- sld r9, r11, tnc
- ld u1, 16(up)
- addi up, up, 24
- srd r12, r11, cnt
- sld r7, u1, tnc
+` rldicl n, n, 0, 32') C zero extend n
+ mtctr n C copy n to count register
addi rp, rp, -16
- bdnz L(gt3)
+ subfic tnc, cnt, 64 C reverse shift count
- or r11, r8, r9
- srd r8, u1, cnt
- b L(cj3)
+ ld u0, 0(up)
+ srd h0, u0, cnt
+ sld r12, u0, tnc C return value
+ bdz L(1) C jump for n = 1
- ALIGN(16)
-L(gt3): ld u0, 0(up)
- or r11, r8, r9
- srd r8, u1, cnt
- sld r9, u0, tnc
ld u1, 8(up)
- or r10, r12, r7
- b L(L11)
-
- ALIGN(32)
-L(b10): srd r12, r10, cnt
- addi rp, rp, -24
- sld r7, r11, tnc
- bdnz L(gt2)
-
- srd r8, r11, cnt
- or r10, r12, r7
- b L(cj2)
-
-L(gt2): ld u0, 16(up)
- srd r8, r11, cnt
- sld r9, u0, tnc
- ld u1, 24(up)
- or r10, r12, r7
- srd r12, u0, cnt
- sld r7, u1, tnc
- ld u0, 32(up)
- or r11, r8, r9
- addi up, up, 16
- b L(L10)
-
- ALIGN(16)
-L(b00): ld u1, 8(up)
- srd r12, r10, cnt
- sld r7, u1, tnc
- ld u0, 16(up)
- srd r8, u1, cnt
- sld r9, u0, tnc
- ld u1, 24(up)
- or r10, r12, r7
- srd r12, u0, cnt
- sld r7, u1, tnc
- addi rp, rp, -8
- bdz L(cj4)
-
-L(gt4): addi up, up, 32
- ld u0, 0(up)
- or r11, r8, r9
- b L(L00)
-
- ALIGN(16)
-L(b01): bdnz L(gt1)
- srd r8, r10, cnt
- std r8, 0(rp)
- b L(ret)
-
-L(gt1): ld u0, 8(up)
- srd r8, r10, cnt
- sld r9, u0, tnc
- ld u1, 16(up)
- srd r12, u0, cnt
- sld r7, u1, tnc
- ld u0, 24(up)
- or r11, r8, r9
- srd r8, u1, cnt
- sld r9, u0, tnc
- ld u1, 32(up)
- addi up, up, 40
- or r10, r12, r7
- bdz L(end)
-
- ALIGN(32)
-L(top): srd r12, u0, cnt
- sld r7, u1, tnc
- ld u0, 0(up)
- std r11, 0(rp)
- or r11, r8, r9
-L(L00): srd r8, u1, cnt
- sld r9, u0, tnc
+ bdz L(2) C jump for n = 2
+
+ ldu u0, 16(up)
+ bdz L(end) C jump for n = 3
+
+L(oop): sld v1, u1, tnc
+ srd h1, u1, cnt
ld u1, 8(up)
- std r10, 8(rp)
- or r10, r12, r7
-L(L11): srd r12, u0, cnt
- sld r7, u1, tnc
- ld u0, 16(up)
- std r11, 16(rp)
- or r11, r8, r9
-L(L10): srd r8, u1, cnt
- sld r9, u0, tnc
- ld u1, 24(up)
- addi up, up, 32
- std r10, 24(rp)
- addi rp, rp, 32
- or r10, r12, r7
- bdnz L(top)
-
- ALIGN(32)
-L(end): srd r12, u0, cnt
- sld r7, u1, tnc
- std r11, 0(rp)
-L(cj4): or r11, r8, r9
- srd r8, u1, cnt
- std r10, 8(rp)
-L(cj3): or r10, r12, r7
- std r11, 16(rp)
-L(cj2): std r10, 24(rp)
- std r8, 32(rp)
-
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
+ or h0, v1, h0
+ stdu h0, 16(rp)
+
+ bdz L(exit)
+
+ sld v0, u0, tnc
+ srd h0, u0, cnt
+ ldu u0, 16(up)
+ or h1, v0, h1
+ std h1, 8(rp)
+
+ bdnz L(oop)
+
+L(end): sld v1, u1, tnc
+ srd h1, u1, cnt
+ or h0, v1, h0
+ stdu h0, 16(rp)
+ sld v0, u0, tnc
+ srd h0, u0, cnt
+ or h1, v0, h1
+ std h1, 8(rp)
+L(1): std h0, 16(rp)
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, r12, 32
+ mr r4, r12
+',` mr r3, r12
+')
+ blr
+
+L(exit): sld v0, u0, tnc
+ srd h0, u0, cnt
+ or h1, v0, h1
+ std h1, 8(rp)
+L(2): sld v1, u1, tnc
+ srd h1, u1, cnt
+ or h0, v1, h0
+ stdu h0, 16(rp)
+ std h1, 8(rp)
ifdef(`HAVE_ABI_mode32',
-` srdi r3, retval, 32
- mr r4, retval
-',` mr r3, retval')
+` srdi r3, r12, 32
+ mr r4, r12
+',` mr r3, r12
+')
blr
EPILOGUE()
diff --git a/gmp/mpn/powerpc64/sec_tabselect.asm b/gmp/mpn/powerpc64/sec_tabselect.asm
deleted file mode 100644
index 085577ca9b..0000000000
--- a/gmp/mpn/powerpc64/sec_tabselect.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-dnl PowerPC-64 mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 1.75
-C POWER4/PPC970 2.0
-C POWER5 ?
-C POWER6 5.0
-C POWER7 1.75
-
-define(`rp', `r3')
-define(`tp', `r4')
-define(`n', `r5')
-define(`nents', `r6')
-define(`which', `r7')
-
-define(`i', `r8')
-define(`j', `r9')
-define(`stride', `r12')
-define(`mask', `r11')
-
-
-ASM_START()
-PROLOGUE(mpn_sec_tabselect)
- addic. j, n, -4 C outer loop induction variable
- std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
- sldi stride, n, 3
-
- blt cr0, L(outer_end)
-L(outer_top):
- mtctr nents
- mr r10, tp
- li r28, 0
- li r29, 0
- li r30, 0
- li r31, 0
- addic. j, j, -4 C outer loop induction variable
- mr i, which
-
- ALIGN(16)
-L(top): addic i, i, -1 C set carry iff i != 0
- subfe mask, mask, mask
- ld r0, 0(tp)
- ld r27, 8(tp)
- and r0, r0, mask
- and r27, r27, mask
- or r28, r28, r0
- or r29, r29, r27
- ld r0, 16(tp)
- ld r27, 24(tp)
- and r0, r0, mask
- and r27, r27, mask
- or r30, r30, r0
- or r31, r31, r27
- add tp, tp, stride
- bdnz L(top)
-
- std r28, 0(rp)
- std r29, 8(rp)
- std r30, 16(rp)
- std r31, 24(rp)
- addi tp, r10, 32
- addi rp, rp, 32
- bge cr0, L(outer_top)
-L(outer_end):
-
- rldicl. r0, n, 63, 63
- beq cr0, L(b0x)
-L(b1x): mtctr nents
- mr r10, tp
- li r28, 0
- li r29, 0
- mr i, which
- ALIGN(16)
-L(tp2): addic i, i, -1
- subfe mask, mask, mask
- ld r0, 0(tp)
- ld r27, 8(tp)
- and r0, r0, mask
- and r27, r27, mask
- or r28, r28, r0
- or r29, r29, r27
- add tp, tp, stride
- bdnz L(tp2)
- std r28, 0(rp)
- std r29, 8(rp)
- addi tp, r10, 16
- addi rp, rp, 16
-
-L(b0x): rldicl. r0, n, 0, 63
- beq cr0, L(b00)
-L(b01): mtctr nents
- mr r10, tp
- li r28, 0
- mr i, which
- ALIGN(16)
-L(tp1): addic i, i, -1
- subfe mask, mask, mask
- ld r0, 0(tp)
- and r0, r0, mask
- or r28, r28, r0
- add tp, tp, stride
- bdnz L(tp1)
- std r28, 0(rp)
-
-L(b00): ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- blr
-EPILOGUE()
diff --git a/gmp/mpn/powerpc64/sqr_diagonal.asm b/gmp/mpn/powerpc64/sqr_diagonal.asm
new file mode 100644
index 0000000000..07f60e0dd5
--- /dev/null
+++ b/gmp/mpn/powerpc64/sqr_diagonal.asm
@@ -0,0 +1,55 @@
+dnl PowerPC-64 mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002, 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630: 18
+C POWER4/PPC970: 8
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C n r5
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ifdef(`HAVE_ABI_mode32',
+` rldicl r5, r5, 0, 32') C zero extend n
+ mtctr r5 C ctr = n, which frees r5 for use as a temporary
+ ld r0, 0(r4) C r0 = up[0]
+ bdz L(end) C n = 1: only the final square below is needed
+ ALIGN(16)
+
+L(top): mulld r5, r0, r0 C r5 = low 64 bits of r0 squared
+ mulhdu r6, r0, r0 C r6 = high 64 bits of r0 squared
+ ld r0, 8(r4) C fetch the next source limb for the following pass
+ addi r4, r4, 8 C up += 1 limb
+ std r5, 0(r3) C store low limb of the square
+ std r6, 8(r3) C store high limb of the square
+ addi r3, r3, 16 C rp += 2 limbs
+ bdnz L(top) C decrement ctr, loop while it is non-zero
+
+L(end): mulld r5, r0, r0 C square the last limb that was loaded
+ mulhdu r6, r0, r0
+ std r5, 0(r3) C final low limb
+ std r6, 8(r3) C final high limb
+
+ blr
+EPILOGUE()
diff --git a/gmp/mpn/powerpc64/umul.asm b/gmp/mpn/powerpc64/umul.asm
index 7fcc72f18f..516be3d98b 100644
--- a/gmp/mpn/powerpc64/umul.asm
+++ b/gmp/mpn/powerpc64/umul.asm
@@ -1,32 +1,21 @@
-dnl PowerPC-64 umul_ppmm -- support for longlong.h
+dnl PowerPC-64 umul_ppmm -- support for longlong.h
-dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+dnl General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/powerpc64/vmx/popcount.asm b/gmp/mpn/powerpc64/vmx/popcount.asm
index b95fb88b1a..b9f5896fb7 100644
--- a/gmp/mpn/powerpc64/vmx/popcount.asm
+++ b/gmp/mpn/powerpc64/vmx/popcount.asm
@@ -1,43 +1,36 @@
dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
-dnl Copyright 2006, 2010 Free Software Foundation, Inc.
+dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C 7400,7410 (G4): ?
-C 744x,745x (G4+): 1.125
-C 970 (G5): 2.25
+C 7400,7410 (G4): 2.75
+C 744x,745x (G4+): 2.25
+C 970 (G5): 5.3
+
+C STATUS
+C * Works for all sizes and alignments.
C TODO
-C * Rewrite the awkward huge n outer loop code.
+C * Tune the awkward huge n outer loop code.
C * Two lvx, two vperm, and two vxor could make us a similar hamdist.
+C * For the 970, a combined VMX+intop approach might be best.
C * Compress cnsts table in 64-bit mode, only half the values are needed.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
@@ -46,11 +39,26 @@ define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
define(`OPERATION_popcount')
-define(`ap', `r3')
-define(`n', `r4')
+ifdef(`OPERATION_popcount',`
+ define(`func',`mpn_popcount')
+ define(`up', `r3')
+ define(`n', `r4')
+ define(`HAM', `dnl')
+')
+ifdef(`OPERATION_hamdist',`
+ define(`func',`mpn_hamdist')
+ define(`up', `r3')
+ define(`vp', `r4')
+ define(`n', `r5')
+ define(`HAM', `$1')
+')
-define(`rtab', `v10')
-define(`cnt4', `v11')
+define(`x01010101',`v2')
+define(`x00110011',`v7')
+define(`x00001111',`v10')
+define(`cnt1',`v11')
+define(`cnt2',`v12')
+define(`cnt4',`v13')
ifelse(GMP_LIMB_BITS,32,`
define(`LIMB32',` $1')
@@ -66,7 +74,7 @@ define(`LIMBS_PER_CHUNK', 0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)
ASM_START()
-PROLOGUE(mpn_popcount,toc)
+PROLOGUE(mpn_popcount)
mfspr r10, 256
oris r0, r10, 0xfffc C Set VRSAVE bit 0-13
mtspr 256, r0
@@ -77,29 +85,30 @@ ifdef(`HAVE_ABI_mode32',
C Load various constants into vector registers
LEAL( r11, cnsts)
li r12, 16
+ vspltisb cnt1, 1 C 0x0101...01 used as shift count
+ vspltisb cnt2, 2 C 0x0202...02 used as shift count
vspltisb cnt4, 4 C 0x0404...04 used as shift count
-
- li r7, 160
- lvx rtab, 0, r11
+ lvx x01010101, 0, r11 C 0x5555...55
+ lvx x00110011, r12, r11 C 0x3333...33
+ vspltisb x00001111, 15 C 0x0f0f...0f
LIMB64(`lis r0, LIMBS_CHUNK_THRES ')
LIMB64(`cmpd cr7, n, r0 ')
- lvx v0, 0, ap
- addi r7, r11, 80
- rlwinm r6, ap, 2,26,29
+ lvx v0, 0, up
+ addi r7, r11, 96
+ rlwinm r6, up, 2,26,29
lvx v8, r7, r6
vand v0, v0, v8
-LIMB32(`rlwinm r8, ap, 30,30,31 ')
-LIMB64(`rlwinm r8, ap, 29,31,31 ')
- add n, n, r8 C compensate n for rounded down `ap'
+LIMB32(`rlwinm r8, up, 30,30,31 ')
+LIMB64(`rlwinm r8, up, 29,31,31 ')
+ add n, n, r8 C compensate n for rounded down `up'
vxor v1, v1, v1
li r8, 0 C grand total count
- vxor v12, v12, v12 C zero total count
- vxor v13, v13, v13 C zero total count
+ vxor v3, v3, v3 C zero total count
addic. n, n, -LIMBS_PER_VR
ble L(sum)
@@ -111,61 +120,82 @@ C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble cr7, L(small) ')
LIMB64(`addis r9, n, -LIMBS_PER_CHUNK ') C remaining n
LIMB64(`lis n, LIMBS_PER_CHUNK ')
-
- ALIGN(16)
L(small):
+
+
LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
addi r7, r7, 1
mtctr r7 C copy n to count register
b L(ent)
- ALIGN(16)
-L(top):
- lvx v0, 0, ap
-L(ent): lvx v1, r12, ap
- addi ap, ap, 32
- vsrb v8, v0, cnt4
- vsrb v9, v1, cnt4
- vperm v2, rtab, rtab, v0
- vperm v3, rtab, rtab, v8
- vperm v4, rtab, rtab, v1
- vperm v5, rtab, rtab, v9
- vaddubm v6, v2, v3
- vaddubm v7, v4, v5
- vsum4ubs v12, v6, v12
- vsum4ubs v13, v7, v13
+ ALIGN(8)
+L(top): lvx v0, 0, up
+ li r7, 128 C prefetch distance
+L(ent): lvx v1, r12, up
+ addi up, up, 32
+ vsr v4, v0, cnt1
+ vsr v5, v1, cnt1
+ dcbt up, r7 C prefetch
+ vand v8, v4, x01010101
+ vand v9, v5, x01010101
+ vsububm v0, v0, v8 C 64 2-bit accumulators (0..2)
+ vsububm v1, v1, v9 C 64 2-bit accumulators (0..2)
+ vsr v4, v0, cnt2
+ vsr v5, v1, cnt2
+ vand v8, v0, x00110011
+ vand v9, v1, x00110011
+ vand v4, v4, x00110011
+ vand v5, v5, x00110011
+ vaddubm v0, v4, v8 C 32 4-bit accumulators (0..4)
+ vaddubm v1, v5, v9 C 32 4-bit accumulators (0..4)
+ vaddubm v8, v0, v1 C 32 4-bit accumulators (0..8)
+ vsr v9, v8, cnt4
+ vand v6, v8, x00001111
+ vand v9, v9, x00001111
+ vaddubm v6, v9, v6 C 16 8-bit accumulators (0..16)
+ vsum4ubs v3, v6, v3 C sum 4 x 4 bytes into 4 32-bit fields
bdnz L(top)
andi. n, n, eval(LIMBS_PER_2VR-1)
beq L(rt)
- lvx v0, 0, ap
+ lvx v0, 0, up
vxor v1, v1, v1
cmpwi n, LIMBS_PER_VR
ble L(sum)
L(lsum):
vor v1, v0, v0
- lvx v0, r12, ap
+ lvx v0, r12, up
L(sum):
LIMB32(`rlwinm r6, n, 4,26,27 ')
LIMB64(`rlwinm r6, n, 5,26,26 ')
- addi r7, r11, 16
+ addi r7, r11, 32
lvx v8, r7, r6
vand v0, v0, v8
- vsrb v8, v0, cnt4
- vsrb v9, v1, cnt4
- vperm v2, rtab, rtab, v0
- vperm v3, rtab, rtab, v8
- vperm v4, rtab, rtab, v1
- vperm v5, rtab, rtab, v9
- vaddubm v6, v2, v3
- vaddubm v7, v4, v5
- vsum4ubs v12, v6, v12
- vsum4ubs v13, v7, v13
-
- ALIGN(16)
-L(rt): vadduwm v3, v12, v13
+
+ vsr v4, v0, cnt1
+ vsr v5, v1, cnt1
+ vand v8, v4, x01010101
+ vand v9, v5, x01010101
+ vsububm v0, v0, v8 C 64 2-bit accumulators (0..2)
+ vsububm v1, v1, v9 C 64 2-bit accumulators (0..2)
+ vsr v4, v0, cnt2
+ vsr v5, v1, cnt2
+ vand v8, v0, x00110011
+ vand v9, v1, x00110011
+ vand v4, v4, x00110011
+ vand v5, v5, x00110011
+ vaddubm v0, v4, v8 C 32 4-bit accumulators (0..4)
+ vaddubm v1, v5, v9 C 32 4-bit accumulators (0..4)
+ vaddubm v8, v0, v1 C 32 4-bit accumulators (0..8)
+ vsr v9, v8, cnt4
+ vand v6, v8, x00001111
+ vand v9, v9, x00001111
+ vaddubm v6, v9, v6 C 16 8-bit accumulators (0..16)
+ vsum4ubs v3, v6, v3 C sum 4 x 4 bytes into 4 32-bit fields
+
+L(rt):
li r7, -16 C FIXME: does all ppc32 and ppc64 ABIs
stvx v3, r7, r1 C FIXME: ...support storing below sp?
@@ -180,8 +210,7 @@ L(rt): vadduwm v3, v12, v13
C Handle outer loop for huge n. We inherit cr7 and r0 from above.
LIMB64(`ble cr7, L(ret)
- vxor v12, v12, v12 C zero total count
- vxor v13, v13, v13 C zero total count
+ vxor v3, v3, v3 C zero total count
mr n, r9
cmpd cr7, n, r0
ble cr7, L(2)
@@ -192,16 +221,17 @@ L(2): srdi r7, n, 2 C loop count corresponding to n
b L(top)
')
- ALIGN(16)
L(ret): mr r3, r8
mtspr 256, r10
blr
EPILOGUE()
DEF_OBJECT(cnsts,16)
-C Counts for vperm
- .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
- .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+ .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+ .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+ .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
C Masks for high end of number
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
diff --git a/gmp/mpn/pyr/add_n.s b/gmp/mpn/pyr/add_n.s
new file mode 100644
index 0000000000..7ac02e6b4d
--- /dev/null
+++ b/gmp/mpn/pyr/add_n.s
@@ -0,0 +1,74 @@
+# Pyramid __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 2
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2 # tr2 = copy of the limb count held in pr3
+ andw $3,tr2 # tr2 = count mod 4: limbs to handle one at a time
+ beq Lend0 # no leftover limbs: go straight to the 4-limb loop
+ subw tr2,pr3 # pr3 = count rounded down to a multiple of 4
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1 # tr1 = next limb of the first source
+ addwc (pr2),tr1 # add the second source limb with carry
+ movw tr1,(pr0) # store the sum limb
+
+ subwb tr0,tr0 # presumably parks the carry in tr0 before the adds below
+ addw $4,pr0 # advance destination by one 4-byte limb
+ addw $4,pr1 # advance first source
+ addw $4,pr2 # advance second source
+ addw $-1,tr2 # one fewer leftover limb
+ bne Loop0 # repeat until the leftover limbs are done
+
+ mtstw pr3,pr3 # presumably tests pr3: any groups of four left?
+ beq Lend # none: finish
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1 # limb 0 of this group
+ addwc (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1 # limb 1 of this group
+ addwc 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1 # limb 2 of this group
+ addwc 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1 # limb 3 of this group
+ addwc 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0 # presumably parks the carry in tr0 again
+ addw $16,pr0 # advance all three pointers by four limbs
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3 # four fewer limbs remaining
+ bne Loop # repeat until pr3 reaches zero
+Lend:
+ mnegw tr0,pr0 # pr0 = negated carry-save value (presumably the returned carry)
+ ret
diff --git a/gmp/mpn/pyr/addmul_1.s b/gmp/mpn/pyr/addmul_1.s
new file mode 100644
index 0000000000..d40a9e77cf
--- /dev/null
+++ b/gmp/mpn/pyr/addmul_1.s
@@ -0,0 +1,43 @@
+# Pyramid __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 2
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mova (pr0)[pr2*4],pr0 # pr0 = pr0 + 4*pr2: just past the end of the vector being added to
+ mova (pr1)[pr2*4],pr1 # pr1 = pr1 + 4*pr2: just past the end of the multiplied vector
+ mnegw pr2,pr2 # pr2 = -count, a negative index that climbs to zero
+ movw $0,tr3 # tr3 = carry limb, initially zero
+
+Loop: movw (pr1)[pr2*4],tr1 # tr1 = next limb to multiply
+ uemul pr3,tr0 # presumably tr0:tr1 = tr1 * pr3 (high:low), judging by the uses below
+ addw tr3,tr1 # low word += carry limb from the previous pass
+ movw $0,tr3
+ addwc tr0,tr3 # tr3 = high word + carry out of the add above
+ movw (pr0)[pr2*0x4],tr0 # tr0 = current limb of the vector being added to
+ addw tr0,tr1 # low word += that limb
+ addwc $0,tr3 # fold the resulting carry into the carry limb
+ movw tr1,(pr0)[pr2*4] # write the result limb back
+ addw $1,pr2 # step the index toward zero
+ bne Loop # repeat until every limb is processed
+
+ movw tr3,pr0 # leave the final carry limb in pr0 (presumably the return value)
+ ret
diff --git a/gmp/mpn/pyr/mul_1.s b/gmp/mpn/pyr/mul_1.s
new file mode 100644
index 0000000000..453727f6d8
--- /dev/null
+++ b/gmp/mpn/pyr/mul_1.s
@@ -0,0 +1,40 @@
+# Pyramid __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 2
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ mova (pr0)[pr2*4],pr0 # pr0 = pr0 + 4*pr2: just past the end of the result vector
+ mova (pr1)[pr2*4],pr1 # pr1 = pr1 + 4*pr2: just past the end of the source vector
+ mnegw pr2,pr2 # pr2 = -count, a negative index that climbs to zero
+ movw $0,tr3 # tr3 = carry limb, initially zero
+
+Loop: movw (pr1)[pr2*4],tr1 # tr1 = next source limb
+ uemul pr3,tr0 # presumably tr0:tr1 = tr1 * pr3 (high:low), judging by the uses below
+ addw tr3,tr1 # low word += carry limb from the previous pass
+ movw $0,tr3
+ addwc tr0,tr3 # tr3 = high word + carry out of the add above
+ movw tr1,(pr0)[pr2*4] # store the result limb
+ addw $1,pr2 # step the index toward zero
+ bne Loop # repeat until every limb is processed
+
+ movw tr3,pr0 # leave the final carry limb in pr0 (presumably the return value)
+ ret
diff --git a/gmp/mpn/pyr/sub_n.s b/gmp/mpn/pyr/sub_n.s
new file mode 100644
index 0000000000..11f185a81a
--- /dev/null
+++ b/gmp/mpn/pyr/sub_n.s
@@ -0,0 +1,74 @@
+# Pyramid __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+.text
+ .align 2
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2 # tr2 = copy of the limb count held in pr3
+ andw $3,tr2 # tr2 = count mod 4: limbs to handle one at a time
+ beq Lend0 # no leftover limbs: go straight to the 4-limb loop
+ subw tr2,pr3 # pr3 = count rounded down to a multiple of 4
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1 # tr1 = next limb of the first source
+ subwb (pr2),tr1 # subtract the second source limb with borrow
+ movw tr1,(pr0) # store the difference limb
+
+ subwb tr0,tr0 # presumably parks the borrow state in tr0 before the adds below
+ addw $4,pr0 # advance destination by one 4-byte limb
+ addw $4,pr1 # advance first source
+ addw $4,pr2 # advance second source
+ addw $-1,tr2 # one fewer leftover limb
+ bne Loop0 # repeat until the leftover limbs are done
+
+ mtstw pr3,pr3 # presumably tests pr3: any groups of four left?
+ beq Lend # none: finish
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1 # limb 0 of this group
+ subwb (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1 # limb 1 of this group
+ subwb 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1 # limb 2 of this group
+ subwb 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1 # limb 3 of this group
+ subwb 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0 # presumably parks the borrow state in tr0 again
+ addw $16,pr0 # advance all three pointers by four limbs
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3 # four fewer limbs remaining
+ bne Loop # repeat until pr3 reaches zero
+Lend:
+ mnegw tr0,pr0 # pr0 = negated carry-save value (presumably the returned borrow)
+ ret
diff --git a/gmp/mpn/s390_32/README b/gmp/mpn/s390/README
index 59519ba538..59519ba538 100644
--- a/gmp/mpn/s390_32/README
+++ b/gmp/mpn/s390/README
diff --git a/gmp/mpn/s390_32/addmul_1.asm b/gmp/mpn/s390/addmul_1.asm
index 97189a8e76..71d49bbfca 100644
--- a/gmp/mpn/s390_32/addmul_1.asm
+++ b/gmp/mpn/s390/addmul_1.asm
@@ -4,30 +4,19 @@ dnl result to a second limb vector.
dnl Copyright 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/s390/gmp-mparam.h b/gmp/mpn/s390/gmp-mparam.h
new file mode 100644
index 0000000000..d738846679
--- /dev/null
+++ b/gmp/mpn/s390/gmp-mparam.h
@@ -0,0 +1,54 @@
+/* IBM s370 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2001, 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+
+/* BITS_PER_MP_LIMB etc generated by configure */
+
+
+/* Generated by tuneup.c, 2001-12-03, gcc 2.95 */
+
+#define MUL_KARATSUBA_THRESHOLD 18
+#define MUL_TOOM3_THRESHOLD 210
+
+#define SQR_BASECASE_THRESHOLD 8
+#define SQR_KARATSUBA_THRESHOLD 40
+#define SQR_TOOM3_THRESHOLD 250
+
+#define DIV_SB_PREINV_THRESHOLD 0
+#define DIV_DC_THRESHOLD 63
+#define POWM_THRESHOLD 63
+
+#define GCD_ACCEL_THRESHOLD 3
+
+#define DIVREM_1_NORM_THRESHOLD 0
+#define DIVREM_1_UNNORM_THRESHOLD 5
+#define MOD_1_NORM_THRESHOLD 0
+#define MOD_1_UNNORM_THRESHOLD 4
+#define USE_PREINV_MOD_1 0
+#define DIVREM_2_THRESHOLD 0
+#define DIVEXACT_1_THRESHOLD 0
+#define MODEXACT_1_ODD_THRESHOLD 0
+
+#define MUL_FFT_TABLE { 432, 992, 1664, 4608, 14336, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 448
+#define MUL_FFT_THRESHOLD 3840
+
+#define SQR_FFT_TABLE { 400, 992, 1664, 4608, 10240, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 416
+#define SQR_FFT_THRESHOLD 3328
diff --git a/gmp/mpn/s390_32/mul_1.asm b/gmp/mpn/s390/mul_1.asm
index e3ad0c59d8..649671b45c 100644
--- a/gmp/mpn/s390_32/mul_1.asm
+++ b/gmp/mpn/s390/mul_1.asm
@@ -4,30 +4,19 @@ dnl result in a second limb vector.
dnl Copyright 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/s390_32/submul_1.asm b/gmp/mpn/s390/submul_1.asm
index da7d849d5d..5301096da2 100644
--- a/gmp/mpn/s390_32/submul_1.asm
+++ b/gmp/mpn/s390/submul_1.asm
@@ -4,30 +4,19 @@ dnl result from a second limb vector.
dnl Copyright 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/s390_32/copyd.asm b/gmp/mpn/s390_32/copyd.asm
deleted file mode 100644
index ff252bc1a6..0000000000
--- a/gmp/mpn/s390_32/copyd.asm
+++ /dev/null
@@ -1,145 +0,0 @@
-dnl S/390-32 mpn_copyd
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C cycles/limb
-C cycles/limb
-C z900 1.65
-C z990 1.125
-C z9 ?
-C z10 ?
-C z196 ?
-
-C FIXME:
-C * Avoid saving/restoring callee-saves registers for n < 3. This could be
-C done by setting rp=r1, up=r2, i=r0 and r3,r4,r5 for clock regs.
-C We could then use r3...r10 in main loop.
-
-C INPUT PARAMETERS
-define(`rp_param', `%r2')
-define(`up_param', `%r3')
-define(`n', `%r4')
-
-define(`rp', `%r8')
-define(`up', `%r9')
-
-ASM_START()
-PROLOGUE(mpn_copyd)
- stm %r6, %r11, 24(%r15)
-
- lr %r1, n
- sll %r1, 2
- la %r10, 8(n)
- ahi %r1, -32
- srl %r10, 3
- lhi %r11, -32
-
- la rp, 0(%r1,rp_param) C FIXME use lay on z990 and later
- la up, 0(%r1,up_param) C FIXME use lay on z990 and later
-
- lhi %r7, 7
- nr %r7, n C n mod 8
- chi %r7, 2
- jh L(b34567)
- chi %r7, 1
- je L(b1)
- jh L(b2)
-
-L(b0): brct %r10, L(top)
- j L(end)
-
-L(b1): l %r0, 28(up)
- ahi up, -4
- st %r0, 28(rp)
- ahi rp, -4
- brct %r10, L(top)
- j L(end)
-
-L(b2): lm %r0, %r1, 24(up)
- ahi up, -8
- stm %r0, %r1, 24(rp)
- ahi rp, -8
- brct %r10, L(top)
- j L(end)
-
-L(b34567):
- chi %r7, 4
- jl L(b3)
- je L(b4)
- chi %r7, 6
- je L(b6)
- jh L(b7)
-
-L(b5): lm %r0, %r4, 12(up)
- ahi up, -20
- stm %r0, %r4, 12(rp)
- ahi rp, -20
- brct %r10, L(top)
- j L(end)
-
-L(b3): lm %r0, %r2, 20(up)
- ahi up, -12
- stm %r0, %r2, 20(rp)
- ahi rp, -12
- brct %r10, L(top)
- j L(end)
-
-L(b4): lm %r0, %r3, 16(up)
- ahi up, -16
- stm %r0, %r3, 16(rp)
- ahi rp, -16
- brct %r10, L(top)
- j L(end)
-
-L(b6): lm %r0, %r5, 8(up)
- ahi up, -24
- stm %r0, %r5, 8(rp)
- ahi rp, -24
- brct %r10, L(top)
- j L(end)
-
-L(b7): lm %r0, %r6, 4(up)
- ahi up, -28
- stm %r0, %r6, 4(rp)
- ahi rp, -28
- brct %r10, L(top)
- j L(end)
-
-L(top): lm %r0, %r7, 0(up)
- la up, 0(%r11,up)
- stm %r0, %r7, 0(rp)
- la rp, 0(%r11,rp)
- brct %r10, L(top)
-
-L(end): lm %r6, %r11, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/copyi.asm b/gmp/mpn/s390_32/copyi.asm
deleted file mode 100644
index 1df32f100e..0000000000
--- a/gmp/mpn/s390_32/copyi.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-dnl S/390-32 mpn_copyi
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 0.75
-C z990 0.375
-C z9 ?
-C z10 ?
-C z196 ?
-
-C NOTE
-C * This is based on GNU libc memcpy which was written by Martin Schwidefsky.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-
-ASM_START()
-PROLOGUE(mpn_copyi)
- ltr %r4, %r4
- sll %r4, 2
- je L(rtn)
- ahi %r4, -1
- lr %r5, %r4
- srl %r5, 8
- ltr %r5, %r5 C < 256 bytes to copy?
- je L(1)
-
-L(top): mvc 0(256, rp), 0(up)
- la rp, 256(rp)
- la up, 256(up)
- brct %r5, L(top)
-
-L(1): bras %r5, L(2) C make r5 point to mvc insn
- mvc 0(1, rp), 0(up)
-L(2): ex %r4, 0(%r5) C execute mvc with length ((n-1) mod 256)+1
-L(rtn): br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/addmul_1.asm b/gmp/mpn/s390_32/esame/addmul_1.asm
deleted file mode 100644
index 4375b74ae0..0000000000
--- a/gmp/mpn/s390_32/esame/addmul_1.asm
+++ /dev/null
@@ -1,72 +0,0 @@
-dnl S/390-32 mpn_addmul_1 for systems with MLR instruction
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 18.5
-C z990 10
-C z9 ?
-C z10 ?
-C z196 ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`v0', `%r5')
-
-define(`z', `%r9')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- stm %r9, %r12, 36(%r15)
- lhi %r12, 0 C zero index register
- ahi %r12, 0 C clear carry fla
- lhi %r11, 0 C clear carry limb
- lhi z, 0 C clear carry limb
-
-L(top): l %r1, 0(%r12,up)
- l %r10, 0(%r12,rp)
- mlr %r0, v0
- alcr %r1, %r10
- alcr %r0, z
- alr %r1, %r11
- lr %r11, %r0
- st %r1, 0(%r12,rp)
- la %r12, 4(%r12)
- brct n, L(top)
-
- lhi %r2, 0
- alcr %r2, %r11
-
- lm %r9, %r12, 36(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/aors_n.asm b/gmp/mpn/s390_32/esame/aors_n.asm
deleted file mode 100644
index 98b0dbc7b0..0000000000
--- a/gmp/mpn/s390_32/esame/aors_n.asm
+++ /dev/null
@@ -1,137 +0,0 @@
-dnl S/390-32 mpn_add_n and mpn_sub_n.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 ?
-C z990 2.75-3 (fast for even n, slow for odd n)
-C z9 ?
-C z10 ?
-C z196 ?
-
-C TODO
-C * Optimise for small n
-C * Use r0 and save/restore one less register
-C * Using logops_n's v1 inner loop operand order make the loop about 20%
-C faster, at the expense of highly alignment-dependent performance.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`vp', `%r4')
-define(`n', `%r5')
-
-ifdef(`OPERATION_add_n', `
- define(ADSB, al)
- define(ADSBCR, alcr)
- define(ADSBC, alc)
- define(RETVAL,`dnl
- lhi %r2, 0
- alcr %r2, %r2')
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(ADSB, sl)
- define(ADSBCR, slbr)
- define(ADSBC, slb)
- define(RETVAL,`dnl
- slbr %r2, %r2
- lcr %r2, %r2')
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
-
-ASM_START()
-PROLOGUE(func)
- stm %r6, %r8, 24(%r15)
-
- ahi n, 3
- lhi %r7, 3
- lr %r1, n
- srl %r1, 2
- nr %r7, n C n mod 4
- je L(b1)
- chi %r7, 2
- jl L(b2)
- jne L(b0)
-
-L(b3): lm %r5, %r7, 0(up)
- la up, 12(up)
- ADSB %r5, 0(vp)
- ADSBC %r6, 4(vp)
- ADSBC %r7, 8(vp)
- la vp, 12(vp)
- stm %r5, %r7, 0(rp)
- la rp, 12(rp)
- brct %r1, L(top)
- j L(end)
-
-L(b0): lm %r5, %r8, 0(up) C This redundant insns is no mistake,
- la up, 16(up) C it is needed to make main loop run
- ADSB %r5, 0(vp) C fast for n = 0 (mod 4).
- ADSBC %r6, 4(vp)
- j L(m0)
-
-L(b1): l %r5, 0(up)
- la up, 4(up)
- ADSB %r5, 0(vp)
- la vp, 4(vp)
- st %r5, 0(rp)
- la rp, 4(rp)
- brct %r1, L(top)
- j L(end)
-
-L(b2): lm %r5, %r6, 0(up)
- la up, 8(up)
- ADSB %r5, 0(vp)
- ADSBC %r6, 4(vp)
- la vp, 8(vp)
- stm %r5, %r6, 0(rp)
- la rp, 8(rp)
- brct %r1, L(top)
- j L(end)
-
-L(top): lm %r5, %r8, 0(up)
- la up, 16(up)
- ADSBC %r5, 0(vp)
- ADSBC %r6, 4(vp)
-L(m0): ADSBC %r7, 8(vp)
- ADSBC %r8, 12(vp)
- la vp, 16(vp)
- stm %r5, %r8, 0(rp)
- la rp, 16(rp)
- brct %r1, L(top)
-
-L(end): RETVAL
- lm %r6, %r8, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/aorslsh1_n.asm b/gmp/mpn/s390_32/esame/aorslsh1_n.asm
deleted file mode 100644
index f2b222b121..0000000000
--- a/gmp/mpn/s390_32/esame/aorslsh1_n.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-dnl S/390-32 mpn_addlsh1_n
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 9.25
-C z990 5
-C z9 ?
-C z10 ?
-C z196 ?
-
-C TODO
-C * Optimise for small n
-C * Compute RETVAL for sublsh1_n less stupidly
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`vp', `%r4')
-define(`n', `%r5')
-
-ifdef(`OPERATION_addlsh1_n',`
- define(ADDSUBC, alr)
- define(ADDSUBE, alcr)
- define(INITCY, `lhi %r13, -1')
- define(RETVAL, `alr %r1, %r13
- lhi %r2, 2
- alr %r2, %r1')
- define(func, mpn_addlsh1_n)
-')
-ifdef(`OPERATION_sublsh1_n',`
- define(ADDSUBC, slr)
- define(ADDSUBE, slbr)
- define(INITCY, `lhi %r13, 0')
- define(RETVAL, `slr %r1, %r13
- lhi %r2, 1
- alr %r2, %r1')
- define(func, mpn_sublsh1_n)
-')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
-
-ASM_START()
-PROLOGUE(func)
- stm %r6, %r13, 24(%r15)
-
- la %r0, 3(n)
- lhi %r7, 3
- srl %r0, 2
- nr %r7, n C n mod 4
- je L(b0)
- chi %r7, 2
- jl L(b1)
- je L(b2)
-
-L(b3): lm %r5, %r7, 0(up)
- la up, 12(up)
- lm %r9, %r11, 0(vp)
- la vp, 12(vp)
-
- alr %r9, %r9
- alcr %r10, %r10
- alcr %r11, %r11
- slbr %r1, %r1
-
- ADDSUBC %r5, %r9
- ADDSUBE %r6, %r10
- ADDSUBE %r7, %r11
- slbr %r13, %r13
-
- stm %r5, %r7, 0(rp)
- la rp, 12(rp)
- brct %r0, L(top)
- j L(end)
-
-L(b0): lhi %r1, -1
- INITCY
- j L(top)
-
-L(b1): l %r5, 0(up)
- la up, 4(up)
- l %r9, 0(vp)
- la vp, 4(vp)
-
- alr %r9, %r9
- slbr %r1, %r1
- ADDSUBC %r5, %r9
- slbr %r13, %r13
-
- st %r5, 0(rp)
- la rp, 4(rp)
- brct %r0, L(top)
- j L(end)
-
-L(b2): lm %r5, %r6, 0(up)
- la up, 8(up)
- lm %r9, %r10, 0(vp)
- la vp, 8(vp)
-
- alr %r9, %r9
- alcr %r10, %r10
- slbr %r1, %r1
-
- ADDSUBC %r5, %r9
- ADDSUBE %r6, %r10
- slbr %r13, %r13
-
- stm %r5, %r6, 0(rp)
- la rp, 8(rp)
- brct %r0, L(top)
- j L(end)
-
-L(top): lm %r9, %r12, 0(vp)
- la vp, 16(vp)
-
- ahi %r1, 1 C restore carry
-
- alcr %r9, %r9
- alcr %r10, %r10
- alcr %r11, %r11
- alcr %r12, %r12
-
- slbr %r1, %r1 C save carry
-
- lm %r5, %r8, 0(up)
- la up, 16(up)
-
- ahi %r13, 1 C restore carry
-
- ADDSUBE %r5, %r9
- ADDSUBE %r6, %r10
- ADDSUBE %r7, %r11
- ADDSUBE %r8, %r12
-
- slbr %r13, %r13
-
- stm %r5, %r8, 0(rp)
- la rp, 16(rp)
- brct %r0, L(top)
-
-L(end):
- RETVAL
- lm %r6, %r13, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/bdiv_dbm1c.asm b/gmp/mpn/s390_32/esame/bdiv_dbm1c.asm
deleted file mode 100644
index 568a2a44e8..0000000000
--- a/gmp/mpn/s390_32/esame/bdiv_dbm1c.asm
+++ /dev/null
@@ -1,65 +0,0 @@
-dnl S/390-32 mpn_bdiv_dbm1c for systems with MLR instruction.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 14
-C z990 10
-C z9 ?
-C z10 ?
-C z196 ?
-
-C INPUT PARAMETERS
-define(`qp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`bd', `%r5')
-define(`cy', `%r6')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_bdiv_dbm1c)
- stm %r6, %r7, 24(%r15)
- lhi %r7, 0 C zero index register
-
-L(top): l %r1, 0(%r7,up)
- mlr %r0, bd
- slr %r6, %r1
- st %r6, 0(%r7,qp)
- slbr %r6, %r0
- la %r7, 4(%r7)
- brct n, L(top)
-
- lr %r2, %r6
- lm %r6, %r7, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/gmp-mparam.h b/gmp/mpn/s390_32/esame/gmp-mparam.h
deleted file mode 100644
index a805fa1492..0000000000
--- a/gmp/mpn/s390_32/esame/gmp-mparam.h
+++ /dev/null
@@ -1,207 +0,0 @@
-/* S/390-32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2008-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 4400 MHz IBM z10 running in 32-bit mode */
-/* FFT tuning limit = 15000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.7 */
-
-#define DIVREM_1_NORM_THRESHOLD 3
-#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1_1P_METHOD 1
-#define MOD_1_NORM_THRESHOLD 4
-#define MOD_1_UNNORM_THRESHOLD 8
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 16
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 30
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 3
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 56
-
-#define MUL_TOOM22_THRESHOLD 8
-#define MUL_TOOM33_THRESHOLD 59
-#define MUL_TOOM44_THRESHOLD 88
-#define MUL_TOOM6H_THRESHOLD 125
-#define MUL_TOOM8H_THRESHOLD 169
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 55
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 14
-#define SQR_TOOM3_THRESHOLD 90
-#define SQR_TOOM4_THRESHOLD 144
-#define SQR_TOOM6_THRESHOLD 196
-#define SQR_TOOM8_THRESHOLD 309
-
-#define MULMID_TOOM42_THRESHOLD 24
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 10
-
-#define MUL_FFT_MODF_THRESHOLD 252 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 252, 5}, { 9, 6}, { 5, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
- { 15, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \
- { 31, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \
- { 7, 8}, { 15, 7}, { 31, 8}, { 19, 7}, \
- { 39, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \
- { 23,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
- { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \
- { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 79, 8}, { 319, 9}, \
- { 175, 8}, { 351, 7}, { 703, 6}, { 1407,10}, \
- { 95, 9}, { 191, 8}, { 383, 9}, { 207, 8}, \
- { 415, 7}, { 831, 9}, { 223,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \
- { 575, 9}, { 319,10}, { 175, 9}, { 351, 8}, \
- { 703, 7}, { 1407,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207, 9}, { 415, 8}, { 831,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 575,11}, \
- { 159,10}, { 351, 9}, { 703, 8}, { 1407,11}, \
- { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 447, 9}, { 895, 8}, { 1791,10}, { 479, 9}, \
- { 959,12}, { 127,11}, { 287,10}, { 575,11}, \
- { 351,10}, { 703, 9}, { 1407,12}, { 191,11}, \
- { 415,10}, { 831,11}, { 447,10}, { 895, 9}, \
- { 1791,11}, { 479,10}, { 959,13}, { 127,12}, \
- { 255,11}, { 575,12}, { 319,11}, { 703,10}, \
- { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \
- { 895,10}, { 1791,11}, { 959,10}, { 1919, 9}, \
- { 3839,13}, { 255,12}, { 575,11}, { 1215,10}, \
- { 2431,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 895,11}, { 1791,12}, { 959,11}, { 1919,10}, \
- { 3839,14}, { 255,13}, { 511,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,13}, { 895,12}, \
- { 1919,11}, { 3839,10}, { 7679,14}, { 511,13}, \
- { 1023,12}, { 2047,13}, { 1151,12}, { 2431,13}, \
- { 1407,14}, { 767,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 149
-#define MUL_FFT_THRESHOLD 2240
-
-#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 244, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 11, 5}, { 23, 6}, { 13, 7}, { 7, 6}, \
- { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 23, 9}, { 7, 8}, { 15, 7}, \
- { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \
- { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 71, 8}, { 143, 7}, { 287,10}, { 47,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \
- { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \
- { 319, 9}, { 175, 8}, { 351, 7}, { 703, 6}, \
- { 1407,10}, { 95, 9}, { 191, 8}, { 383,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
- { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \
- { 175, 9}, { 351, 8}, { 703, 7}, { 1407,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \
- { 703, 8}, { 1407,11}, { 191,10}, { 415,11}, \
- { 223,10}, { 447, 9}, { 895, 8}, { 1791,10}, \
- { 479, 9}, { 959,12}, { 127,11}, { 255,10}, \
- { 511,11}, { 287,10}, { 575,11}, { 319,10}, \
- { 639,11}, { 351,10}, { 703, 9}, { 1407,12}, \
- { 191,11}, { 415,10}, { 831,11}, { 447,10}, \
- { 895, 9}, { 1791,11}, { 479,13}, { 127,12}, \
- { 255,11}, { 575,12}, { 319,11}, { 703,10}, \
- { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \
- { 895,10}, { 1791,11}, { 959,10}, { 1919,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 575,11}, \
- { 1215,10}, { 2431,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 895,11}, { 1791,12}, { 959,11}, \
- { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,11}, { 2943,10}, \
- { 5887,13}, { 895,12}, { 1919,11}, { 3839,10}, \
- { 7679,14}, { 511,13}, { 1023,12}, { 2047,13}, \
- { 1151,12}, { 2431,13}, { 1407,12}, { 2943,11}, \
- { 5887,14}, { 767,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 161
-#define SQR_FFT_THRESHOLD 1728
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 4392
-
-#define DC_DIV_QR_THRESHOLD 43
-#define DC_DIVAPPR_Q_THRESHOLD 150
-#define DC_BDIV_QR_THRESHOLD 38
-#define DC_BDIV_Q_THRESHOLD 107
-
-#define INV_MULMOD_BNM1_THRESHOLD 14
-#define INV_NEWTON_THRESHOLD 165
-#define INV_APPR_THRESHOLD 149
-
-#define BINV_NEWTON_THRESHOLD 147
-#define REDC_1_TO_REDC_N_THRESHOLD 43
-
-#define MU_DIV_QR_THRESHOLD 777
-#define MU_DIVAPPR_Q_THRESHOLD 942
-#define MUPI_DIV_QR_THRESHOLD 69
-#define MU_BDIV_QR_THRESHOLD 654
-#define MU_BDIV_Q_THRESHOLD 777
-
-#define POWM_SEC_TABLE 3,32,126,692,1486
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 103
-#define HGCD_APPR_THRESHOLD 144
-#define HGCD_REDUCE_THRESHOLD 1437
-#define GCD_DC_THRESHOLD 275
-#define GCDEXT_DC_THRESHOLD 206
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 9
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 532
-#define SET_STR_PRECOMPUTE_THRESHOLD 999
-
-#define FAC_DSC_THRESHOLD 156
-#define FAC_ODD_THRESHOLD 24
diff --git a/gmp/mpn/s390_32/esame/mul_1.asm b/gmp/mpn/s390_32/esame/mul_1.asm
deleted file mode 100644
index 04be963651..0000000000
--- a/gmp/mpn/s390_32/esame/mul_1.asm
+++ /dev/null
@@ -1,66 +0,0 @@
-dnl S/390-32 mpn_mul_1 for systems with MLR instruction
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 14
-C z990 9
-C z9 ?
-C z10 ?
-C z196 ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`v0', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_mul_1)
- stm %r11, %r12, 44(%r15)
- lhi %r12, 0 C zero index register
- ahi %r12, 0 C clear carry flag
- lhi %r11, 0 C clear carry limb
-
-L(top): l %r1, 0(%r12,up)
- mlr %r0, v0
- alcr %r1, %r11
- lr %r11, %r0 C copy high part to carry limb
- st %r1, 0(%r12,rp)
- la %r12, 4(%r12)
- brct n, L(top)
-
- lhi %r2, 0
- alcr %r2, %r11
-
- lm %r11, %r12, 44(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/mul_basecase.asm b/gmp/mpn/s390_32/esame/mul_basecase.asm
deleted file mode 100644
index 2c8138d8d2..0000000000
--- a/gmp/mpn/s390_32/esame/mul_basecase.asm
+++ /dev/null
@@ -1,130 +0,0 @@
-dnl S/390-32/esame mpn_mul_basecase.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 ?
-C z990 ?
-C z9 ?
-C z10 ?
-C z196 ?
-
-C TODO
-C * Perhaps add special case for un <= 2.
-C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped
-C up by about 10%.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`un', `%r4')
-define(`vp', `%r5')
-define(`vn', `%r6')
-
-define(`zero', `%r8')
-
-ASM_START()
-PROLOGUE(mpn_mul_basecase)
- chi un, 2
- jhe L(ge2)
-
-C un = vn = 1
- l %r1, 0(vp)
- ml %r0, 0(up)
- st %r1, 0(rp)
- st %r0, 4(rp)
- br %r14
-
-L(ge2): C jne L(gen)
-
-
-L(gen):
-C mul_1 =======================================================================
-
- stm %r6, %r12, 24(%r15)
- lhi zero, 0
- ahi un, -1
-
- l %r7, 0(vp)
- l %r11, 0(up)
- lhi %r12, 4 C init index register
- mlr %r10, %r7
- lr %r9, un
- st %r11, 0(rp)
- cr %r15, %r15 C clear carry flag
-
-L(tm): l %r1, 0(%r12,up)
- mlr %r0, %r7
- alcr %r1, %r10
- lr %r10, %r0 C copy high part to carry limb
- st %r1, 0(%r12,rp)
- la %r12, 4(%r12)
- brct %r9, L(tm)
-
- alcr %r0, zero
- st %r0, 0(%r12,rp)
-
-C addmul_1 loop ===============================================================
-
- ahi vn, -1
- je L(outer_end)
-L(outer_loop):
-
- la rp, 4(rp) C rp += 1
- la vp, 4(vp) C up += 1
- l %r7, 0(vp)
- l %r11, 0(up)
- lhi %r12, 4 C init index register
- mlr %r10, %r7
- lr %r9, un
- al %r11, 0(rp)
- st %r11, 0(rp)
-
-L(tam): l %r1, 0(%r12,up)
- l %r11, 0(%r12,rp)
- mlr %r0, %r7
- alcr %r1, %r11
- alcr %r0, zero
- alr %r1, %r10
- lr %r10, %r0
- st %r1, 0(%r12,rp)
- la %r12, 4(%r12)
- brct %r9, L(tam)
-
- alcr %r0, zero
- st %r0, 0(%r12,rp)
-
- brct vn, L(outer_loop)
-L(outer_end):
-
- lm %r6, %r12, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/sqr_basecase.asm b/gmp/mpn/s390_32/esame/sqr_basecase.asm
deleted file mode 100644
index dcc13112bf..0000000000
--- a/gmp/mpn/s390_32/esame/sqr_basecase.asm
+++ /dev/null
@@ -1,203 +0,0 @@
-dnl S/390-32 mpn_sqr_basecase.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 ?
-C z990 23
-C z9 ?
-C z10 ?
-C z196 ?
-
-C TODO
-C * Clean up.
-C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
-C This will ask for basecase handling of n = 3.
-C * Update counters and pointers more straightforwardly, possibly lowering
-C register usage.
-C * Should we use this allocation-free style for more sqr_basecase asm
-C implementations? The only disadvantage is that it requires R != U.
-C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped
-C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even
-C more.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-
-define(`zero', `%r8')
-define(`rp_saved', `%r9')
-define(`up_saved', `%r13')
-define(`n_saved', `%r14')
-
-ASM_START()
-PROLOGUE(mpn_sqr_basecase)
- ahi n, -2
- jhe L(ge2)
-
-C n = 1
- l %r5, 0(up)
- mlr %r4, %r5
- st %r5, 0(rp)
- st %r4, 4(rp)
- br %r14
-
-L(ge2): jne L(gen)
-
-C n = 2
- stm %r6, %r8, 24(%r15)
- lhi zero, 0
-
- l %r5, 0(up)
- mlr %r4, %r5 C u0 * u0
- l %r1, 4(up)
- mlr %r0, %r1 C u1 * u1
- st %r5, 0(rp)
-
- l %r7, 0(up)
- ml %r6, 4(up) C u0 * u1
- alr %r7, %r7
- alcr %r6, %r6
- alcr %r0, zero
-
- alr %r4, %r7
- alcr %r1, %r6
- alcr %r0, zero
- st %r4, 4(rp)
- st %r1, 8(rp)
- st %r0, 12(rp)
-
- lm %r6, %r8, 24(%r15)
- br %r14
-
-L(gen):
-C mul_1 =======================================================================
-
- stm %r6, %r14, 24(%r15)
- lhi zero, 0
- lr up_saved, up
- lr rp_saved, rp
- lr n_saved, n
-
- l %r6, 0(up)
- l %r11, 4(up)
- lhi %r12, 8 C init index register
- mlr %r10, %r6
- lr %r5, n
- st %r11, 4(rp)
- cr %r15, %r15 C clear carry flag
-
-L(tm): l %r1, 0(%r12,up)
- mlr %r0, %r6
- alcr %r1, %r10
- lr %r10, %r0 C copy high part to carry limb
- st %r1, 0(%r12,rp)
- la %r12, 4(%r12)
- brct %r5, L(tm)
-
- alcr %r0, zero
- st %r0, 0(%r12,rp)
-
-C addmul_1 loop ===============================================================
-
- ahi n, -1
- je L(outer_end)
-L(outer_loop):
-
- la rp, 8(rp) C rp += 2
- la up, 4(up) C up += 1
- l %r6, 0(up)
- l %r11, 4(up)
- lhi %r12, 8 C init index register
- mlr %r10, %r6
- lr %r5, n
- al %r11, 4(rp)
- st %r11, 4(rp)
-
-L(tam): l %r1, 0(%r12,up)
- l %r7, 0(%r12,rp)
- mlr %r0, %r6
- alcr %r1, %r7
- alcr %r0, zero
- alr %r1, %r10
- lr %r10, %r0
- st %r1, 0(%r12,rp)
- la %r12, 4(%r12)
- brct %r5, L(tam)
-
- alcr %r0, zero
- st %r0, 0(%r12,rp)
-
- brct n, L(outer_loop)
-L(outer_end):
-
- l %r6, 4(up)
- l %r1, 8(up)
- lr %r7, %r0 C Same as: l %r7, 12(,rp)
- mlr %r0, %r6
- alr %r1, %r7
- alcr %r0, zero
- st %r1, 12(rp)
- st %r0, 16(rp)
-
-C sqr_dia_addlsh1 ============================================================
-
-define(`up', `up_saved')
-define(`rp', `rp_saved')
- la n, 1(n_saved)
-
- l %r1, 0(up)
- mlr %r0, %r1
- st %r1, 0(rp)
-C clr %r15, %r15 C clear carry (already clear per above)
-
-L(top): l %r11, 4(up)
- la up, 4(up)
- l %r6, 4(rp)
- l %r7, 8(rp)
- mlr %r10, %r11
- alcr %r6, %r6
- alcr %r7, %r7
- alcr %r10, zero C propagate carry to high product limb
- alr %r6, %r0
- alcr %r7, %r11
- stm %r6, %r7, 4(rp)
- la rp, 8(rp)
- lr %r0, %r10 C copy carry limb
- brct n, L(top)
-
- alcr %r0, zero
- st %r0, 4(rp)
-
- lm %r6, %r14, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/esame/submul_1.asm b/gmp/mpn/s390_32/esame/submul_1.asm
deleted file mode 100644
index a71e57e230..0000000000
--- a/gmp/mpn/s390_32/esame/submul_1.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-dnl S/390-32 mpn_submul_1 for systems with MLR instruction.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 20
-C z990 11
-C z9 ?
-C z10 ?
-C z196 ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`v0', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- stm %r9, %r12, 36(%r15)
- lhi %r12, 0
- slr %r11, %r11
-
-L(top): l %r1, 0(%r12, up)
- l %r10, 0(%r12, rp)
- mlr %r0, v0
- slbr %r10, %r1
- slbr %r9, %r9
- slr %r0, %r9 C conditional incr
- slr %r10, %r11
- lr %r11, %r0
- st %r10, 0(%r12, rp)
- la %r12, 4(%r12)
- brct %r4, L(top)
-
- lr %r2, %r11
- slbr %r9, %r9
- slr %r2, %r9
-
- lm %r9, %r12, 36(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/gmp-mparam.h b/gmp/mpn/s390_32/gmp-mparam.h
deleted file mode 100644
index 1aca74a818..0000000000
--- a/gmp/mpn/s390_32/gmp-mparam.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* S/390-32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 770 MHz IBM z900 running in 32-bit mode, using just traditional insns */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 5
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 5
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 15
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 19
-#define MUL_TOOM33_THRESHOLD 114
-#define MUL_TOOM44_THRESHOLD 166
-#define MUL_TOOM6H_THRESHOLD 226
-#define MUL_TOOM8H_THRESHOLD 333
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 106
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
-
-#define SQR_BASECASE_THRESHOLD 7
-#define SQR_TOOM2_THRESHOLD 40
-#define SQR_TOOM3_THRESHOLD 126
-#define SQR_TOOM4_THRESHOLD 192
-#define SQR_TOOM6_THRESHOLD 246
-#define SQR_TOOM8_THRESHOLD 357
-
-#define MULMID_TOOM42_THRESHOLD 28
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 18
-
-#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 8, 5}, { 17, 6}, { 13, 7}, { 7, 6}, \
- { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
- { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
- { 33, 8}, { 19, 7}, { 39, 8}, { 23, 7}, \
- { 47, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \
- { 63, 9}, { 39, 8}, { 79, 9}, { 47,10}, \
- { 31, 9}, { 63, 8}, { 127, 9}, { 71, 8}, \
- { 143, 9}, { 79,10}, { 47,11}, { 2048,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 48
-#define MUL_FFT_THRESHOLD 2688
-
-#define SQR_FFT_MODF_THRESHOLD 216 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 216, 5}, { 7, 4}, { 15, 5}, { 17, 6}, \
- { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
- { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \
- { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \
- { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \
- { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \
- { 63, 9}, { 39, 8}, { 79, 9}, { 47,10}, \
- { 31, 9}, { 63, 8}, { 127, 9}, { 71, 8}, \
- { 143, 9}, { 79,10}, { 47,11}, { 2048,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 44
-#define SQR_FFT_THRESHOLD 1856
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 61
-#define MULLO_MUL_N_THRESHOLD 5240
-
-#define DC_DIV_QR_THRESHOLD 70
-#define DC_DIVAPPR_Q_THRESHOLD 234
-#define DC_BDIV_QR_THRESHOLD 59
-#define DC_BDIV_Q_THRESHOLD 137
-
-#define INV_MULMOD_BNM1_THRESHOLD 36
-#define INV_NEWTON_THRESHOLD 327
-#define INV_APPR_THRESHOLD 268
-
-#define BINV_NEWTON_THRESHOLD 324
-#define REDC_1_TO_REDC_N_THRESHOLD 63
-
-#define MU_DIV_QR_THRESHOLD 1099
-#define MU_DIVAPPR_Q_THRESHOLD 1360
-#define MUPI_DIV_QR_THRESHOLD 138
-#define MU_BDIV_QR_THRESHOLD 889
-#define MU_BDIV_Q_THRESHOLD 1234
-
-#define MATRIX22_STRASSEN_THRESHOLD 18
-#define HGCD_THRESHOLD 167
-#define GCD_DC_THRESHOLD 518
-#define GCDEXT_DC_THRESHOLD 378
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 25
-#define SET_STR_DC_THRESHOLD 577
-#define SET_STR_PRECOMPUTE_THRESHOLD 1217
diff --git a/gmp/mpn/s390_32/logops_n.asm b/gmp/mpn/s390_32/logops_n.asm
deleted file mode 100644
index 1f2cd2a8f6..0000000000
--- a/gmp/mpn/s390_32/logops_n.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-dnl S/390-32 logops.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb variant 1 variant 2 variant 3
-C rp!=up rp=up
-C z900 ? ? ? ?
-C z990 2.5 1 2.75 2.75
-C z9 ? ? ?
-C z10 ? ? ?
-C z196 ? ? ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`vp', `%r4')
-define(`nn', `%r5')
-
-ifdef(`OPERATION_and_n',`
- define(`func',`mpn_and_n')
- define(`VARIANT_1')
- define(`LOGOPC',`nc')
- define(`LOGOP',`n')')
-ifdef(`OPERATION_andn_n',`
- define(`func',`mpn_andn_n')
- define(`VARIANT_2')
- define(`LOGOP',`n')')
-ifdef(`OPERATION_nand_n',`
- define(`func',`mpn_nand_n')
- define(`VARIANT_3')
- define(`LOGOP',`n')')
-ifdef(`OPERATION_ior_n',`
- define(`func',`mpn_ior_n')
- define(`VARIANT_1')
- define(`LOGOPC',`oc')
- define(`LOGOP',`o')')
-ifdef(`OPERATION_iorn_n',`
- define(`func',`mpn_iorn_n')
- define(`VARIANT_2')
- define(`LOGOP',`o')')
-ifdef(`OPERATION_nior_n',`
- define(`func',`mpn_nior_n')
- define(`VARIANT_3')
- define(`LOGOP',`o')')
-ifdef(`OPERATION_xor_n',`
- define(`func',`mpn_xor_n')
- define(`VARIANT_1')
- define(`LOGOPC',`xc')
- define(`LOGOP',`x')')
-ifdef(`OPERATION_xnor_n',`
- define(`func',`mpn_xnor_n')
- define(`VARIANT_2')
- define(`LOGOP',`x')')
-
-MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-
-ASM_START()
-PROLOGUE(func)
-ifdef(`VARIANT_1',`
- cr rp, up
- jne L(normal)
-
- sll nn, 2
- ahi nn, -1
- lr %r1, nn
- srl %r1, 8
- ltr %r1, %r1 C < 256 bytes to copy?
- je L(1)
-
-L(tp): LOGOPC 0(256, rp), 0(vp)
- la rp, 256(rp)
- la vp, 256(vp)
- brct %r1, L(tp)
-
-L(1): bras %r1, L(2) C make r1 point to mvc insn
- LOGOPC 0(1, rp), 0(vp)
-L(2): ex nn, 0(%r1) C execute mvc with length ((nn-1) mod 256)+1
-L(rtn): br %r14
-
-
-L(normal):
- stm %r6, %r8, 12(%r15)
- ahi nn, 3
- lhi %r7, 3
- lr %r0, nn
- srl %r0, 2
- nr %r7, nn C nn mod 4
- je L(b1)
- chi %r7, 2
- jl L(b2)
- jne L(top)
-
-L(b3): lm %r5, %r7, 0(up)
- la up, 12(up)
- LOGOP %r5, 0(vp)
- LOGOP %r6, 4(vp)
- LOGOP %r7, 8(vp)
- stm %r5, %r7, 0(rp)
- la rp, 12(rp)
- la vp, 12(vp)
- j L(mid)
-
-L(b1): l %r5, 0(up)
- la up, 4(up)
- LOGOP %r5, 0(vp)
- st %r5, 0(rp)
- la rp, 4(rp)
- la vp, 4(vp)
- j L(mid)
-
-L(b2): lm %r5, %r6, 0(up)
- la up, 8(up)
- LOGOP %r5, 0(vp)
- LOGOP %r6, 4(vp)
- stm %r5, %r6, 0(rp)
- la rp, 8(rp)
- la vp, 8(vp)
- j L(mid)
-
-L(top): lm %r5, %r8, 0(up)
- la up, 16(up)
- LOGOP %r5, 0(vp)
- LOGOP %r6, 4(vp)
- LOGOP %r7, 8(vp)
- LOGOP %r8, 12(vp)
- stm %r5, %r8, 0(rp)
- la rp, 16(rp)
- la vp, 16(vp)
-L(mid): brct %r0, L(top)
-
- lm %r6, %r8, 12(%r15)
- br %r14
-')
-
-ifdef(`VARIANT_2',`
- stm %r6, %r8, 12(%r15)
- lhi %r1, -1
-
- ahi nn, 3
- lhi %r7, 3
- lr %r0, nn
- srl %r0, 2
- nr %r7, nn C nn mod 4
- je L(b1)
- chi %r7, 2
- jl L(b2)
- jne L(top)
-
-L(b3): lm %r5, %r7, 0(vp)
- la vp, 12(vp)
- xr %r5, %r1
- xr %r6, %r1
- xr %r7, %r1
- LOGOP %r5, 0(up)
- LOGOP %r6, 4(up)
- LOGOP %r7, 8(up)
- stm %r5, %r7, 0(rp)
- la rp, 12(rp)
- la up, 12(up)
- j L(mid)
-
-L(b1): l %r5, 0(vp)
- la vp, 4(vp)
- xr %r5, %r1
- LOGOP %r5, 0(up)
- st %r5, 0(rp)
- la rp, 4(rp)
- la up, 4(up)
- j L(mid)
-
-L(b2): lm %r5, %r6, 0(vp)
- la vp, 8(vp)
- xr %r5, %r1
- xr %r6, %r1
- LOGOP %r5, 0(up)
- LOGOP %r6, 4(up)
- stm %r5, %r6, 0(rp)
- la rp, 8(rp)
- la up, 8(up)
- j L(mid)
-
-L(top): lm %r5, %r8, 0(vp)
- la vp, 16(vp)
- xr %r5, %r1
- xr %r6, %r1
- xr %r7, %r1
- xr %r8, %r1
- LOGOP %r5, 0(up)
- LOGOP %r6, 4(up)
- LOGOP %r7, 8(up)
- LOGOP %r8, 12(up)
- la up, 16(up)
- stm %r5, %r8, 0(rp)
- la rp, 16(rp)
-L(mid): brct %r0, L(top)
-
- lm %r6, %r8, 12(%r15)
- br %r14
-')
-
-ifdef(`VARIANT_3',`
- stm %r6, %r8, 12(%r15)
- lhi %r1, -1
-
- ahi nn, 3
- lhi %r7, 3
- lr %r0, nn
- srl %r0, 2
- nr %r7, nn C nn mod 4
- je L(b1)
- chi %r7, 2
- jl L(b2)
- jne L(top)
-
-L(b3): lm %r5, %r7, 0(vp)
- la vp, 12(vp)
- LOGOP %r5, 0(up)
- LOGOP %r6, 4(up)
- xr %r5, %r1
- xr %r6, %r1
- LOGOP %r7, 8(up)
- xr %r7, %r1
- stm %r5, %r7, 0(rp)
- la rp, 12(rp)
- la up, 12(up)
- j L(mid)
-
-L(b1): l %r5, 0(vp)
- la vp, 4(vp)
- LOGOP %r5, 0(up)
- xr %r5, %r1
- st %r5, 0(rp)
- la rp, 4(rp)
- la up, 4(up)
- j L(mid)
-
-L(b2): lm %r5, %r6, 0(vp)
- la vp, 8(vp)
- LOGOP %r5, 0(up)
- LOGOP %r6, 4(up)
- xr %r5, %r1
- xr %r6, %r1
- stm %r5, %r6, 0(rp)
- la rp, 8(rp)
- la up, 8(up)
- j L(mid)
-
-L(top): lm %r5, %r8, 0(vp)
- la vp, 16(vp)
- LOGOP %r5, 0(up)
- LOGOP %r6, 4(up)
- xr %r5, %r1
- xr %r6, %r1
- LOGOP %r7, 8(up)
- LOGOP %r8, 12(up)
- xr %r7, %r1
- xr %r8, %r1
- stm %r5, %r8, 0(rp)
- la up, 16(up)
- la rp, 16(rp)
-L(mid): brct %r0, L(top)
-
- lm %r6, %r8, 12(%r15)
- br %r14
-')
-
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/lshift.asm b/gmp/mpn/s390_32/lshift.asm
deleted file mode 100644
index da7d76e844..0000000000
--- a/gmp/mpn/s390_32/lshift.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl S/390-32 mpn_lshift.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 6
-C z990 3
-C z9 ?
-C z10 ?
-C z196 ?
-
-C TODO
-C *
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`cnt', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_lshift)
- lr %r1, n
- sll %r1, 2
- stm %r6, %r12, 24(%r15)
- la up, 0(%r1,up) C put up near end of U
- la rp, 0(%r1,rp) C put rp near end of R
- ahi up, -20
- ahi rp, -16
- lhi %r8, 32
- sr %r8, cnt
- l %r12, 16(up)
- srl %r12, 0(%r8) C return value
- lhi %r7, 3
- nr %r7, n
- srl n, 2
- je L(b0)
- chi %r7, 2
- jl L(b1)
- je L(b2)
-
-L(b3): l %r10, 16(up)
- l %r11, 12(up)
- l %r9, 8(up)
- ahi up, -8
- lr %r8, %r11
- sldl %r10, 0(cnt)
- sldl %r8, 0(cnt)
- st %r10, 12(rp)
- st %r8, 8(rp)
- ahi rp, -8
- ltr n, n
- je L(end)
- j L(top)
-
-L(b2): l %r10, 16(up)
- l %r11, 12(up)
- ahi up, -4
- sldl %r10, 0(cnt)
- st %r10, 12(rp)
- ahi rp, -4
- ltr n, n
- je L(end)
- j L(top)
-
-L(b1): ltr n, n
- je L(end)
- j L(top)
-
-L(b0): l %r10,16(up)
- l %r8, 12(up)
- l %r6, 8(up)
- l %r0, 4(up)
- ahi up, -12
- lr %r11, %r8
- lr %r9, %r6
- lr %r7, %r0
- sldl %r10,0(cnt)
- sldl %r8, 0(cnt)
- sldl %r6, 0(cnt)
- st %r10, 12(rp)
- st %r8, 8(rp)
- st %r6, 4(rp)
- ahi rp, -12
- ahi n, -1
- je L(end)
-
- ALIGN(8)
-L(top): l %r10, 16(up)
- l %r8, 12(up)
- l %r6, 8(up)
- l %r0, 4(up)
- l %r1, 0(up)
- lr %r11, %r8
- lr %r9, %r6
- lr %r7, %r0
- ahi up, -16
- sldl %r10, 0(cnt)
- sldl %r8, 0(cnt)
- sldl %r6, 0(cnt)
- sldl %r0, 0(cnt)
- st %r10, 12(rp)
- st %r8, 8(rp)
- st %r6, 4(rp)
- st %r0, 0(rp)
- ahi rp, -16
- brct n, L(top)
-
-L(end): l %r10, 16(up)
- sll %r10, 0(cnt)
- st %r10, 12(rp)
-
- lr %r2, %r12
- lm %r6, %r12, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/lshiftc.asm b/gmp/mpn/s390_32/lshiftc.asm
deleted file mode 100644
index f601673249..0000000000
--- a/gmp/mpn/s390_32/lshiftc.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-dnl S/390-32 mpn_lshiftc.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 7
-C z990 3.375
-C z9 ?
-C z10 ?
-C z196 ?
-
-C TODO
-C *
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`cnt', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_lshiftc)
- lr %r1, n
- sll %r1, 2
- stm %r6, %r13, 24(%r15)
- la up, 0(%r1,up) C put up near end of U
- la rp, 0(%r1,rp) C put rp near end of R
- ahi up, -20
- ahi rp, -16
- lhi %r8, 32
- sr %r8, cnt
- l %r12, 16(up)
- srl %r12, 0(%r8) C return value
- lhi %r13, -1
- lhi %r7, 3
- nr %r7, n
- srl n, 2
- je L(b0)
- chi %r7, 2
- jl L(b1)
- je L(b2)
-
-L(b3): l %r10, 16(up)
- l %r11, 12(up)
- l %r9, 8(up)
- ahi up, -8
- lr %r8, %r11
- sldl %r10, 0(cnt)
- sldl %r8, 0(cnt)
- xr %r10, %r13
- xr %r8, %r13
- st %r10, 12(rp)
- st %r8, 8(rp)
- ahi rp, -8
- ltr n, n
- je L(end)
- j L(top)
-
-L(b2): l %r10, 16(up)
- l %r11, 12(up)
- ahi up, -4
- sldl %r10, 0(cnt)
- xr %r10, %r13
- st %r10, 12(rp)
- ahi rp, -4
- ltr n, n
- je L(end)
- j L(top)
-
-L(b1): ltr n, n
- je L(end)
- j L(top)
-
-L(b0): l %r10,16(up)
- l %r8, 12(up)
- l %r6, 8(up)
- l %r0, 4(up)
- ahi up, -12
- lr %r11, %r8
- lr %r9, %r6
- lr %r7, %r0
- sldl %r10,0(cnt)
- sldl %r8, 0(cnt)
- sldl %r6, 0(cnt)
- xr %r10, %r13
- xr %r8, %r13
- xr %r6, %r13
- st %r10, 12(rp)
- st %r8, 8(rp)
- st %r6, 4(rp)
- ahi rp, -12
- ahi n, -1
- je L(end)
-
- ALIGN(8)
-L(top): l %r10, 16(up)
- l %r8, 12(up)
- l %r6, 8(up)
- l %r0, 4(up)
- l %r1, 0(up)
- lr %r11, %r8
- lr %r9, %r6
- lr %r7, %r0
- ahi up, -16
- sldl %r10, 0(cnt)
- sldl %r8, 0(cnt)
- sldl %r6, 0(cnt)
- sldl %r0, 0(cnt)
- xr %r10, %r13
- xr %r8, %r13
- xr %r6, %r13
- xr %r0, %r13
- st %r10, 12(rp)
- st %r8, 8(rp)
- st %r6, 4(rp)
- st %r0, 0(rp)
- ahi rp, -16
- brct n, L(top)
-
-L(end): l %r10, 16(up)
- sll %r10, 0(cnt)
- xr %r10, %r13
- st %r10, 12(rp)
-
- lr %r2, %r12
- lm %r6, %r13, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_32/rshift.asm b/gmp/mpn/s390_32/rshift.asm
deleted file mode 100644
index 5f2cf37ca0..0000000000
--- a/gmp/mpn/s390_32/rshift.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-dnl S/390-32 mpn_rshift.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 6
-C z990 3
-C z9 ?
-C z10 ?
-C z196 ?
-
-C TODO
-C *
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`cnt', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_rshift)
- stm %r6, %r12, 24(%r15)
- lhi %r8, 32
- sr %r8, cnt
- l %r12, 0(up)
- sll %r12, 0(%r8) C return value
- lhi %r7, 3
- nr %r7, n
- srl n, 2
- je L(b0)
- chi %r7, 2
- jl L(b1)
- je L(b2)
-
-L(b3): l %r11, 0(up)
- l %r10, 4(up)
- l %r8, 8(up)
- ahi up, 8
- lr %r9, %r10
- srdl %r10, 0(cnt)
- srdl %r8, 0(cnt)
- st %r11, 0(rp)
- st %r9, 4(rp)
- ahi rp, 8
- ltr n, n
- je L(end)
- j L(top)
-
-L(b2): l %r11, 0(up)
- l %r10, 4(up)
- ahi up, 4
- srdl %r10, 0(cnt)
- st %r11, 0(rp)
- ahi rp, 4
- ltr n, n
- je L(end)
- j L(top)
-
-L(b1): ltr n, n
- je L(end)
- j L(top)
-
-L(b0): l %r11, 0(up)
- l %r9, 4(up)
- l %r7, 8(up)
- l %r1, 12(up)
- ahi up, 12
- lr %r10, %r9
- lr %r8, %r7
- lr %r6, %r1
- srdl %r10, 0(cnt)
- srdl %r8, 0(cnt)
- srdl %r6, 0(cnt)
- st %r11, 0(rp)
- st %r9, 4(rp)
- st %r7, 8(rp)
- ahi rp, 12
- ahi n, -1
- je L(end)
-
- ALIGN(8)
-L(top): l %r11, 0(up)
- l %r9, 4(up)
- l %r7, 8(up)
- l %r1, 12(up)
- l %r0, 16(up)
- lr %r10, %r9
- lr %r8, %r7
- lr %r6, %r1
- ahi up, 16
- srdl %r10, 0(cnt)
- srdl %r8, 0(cnt)
- srdl %r6, 0(cnt)
- srdl %r0, 0(cnt)
- st %r11, 0(rp)
- st %r9, 4(rp)
- st %r7, 8(rp)
- st %r1, 12(rp)
- ahi rp, 16
- brct n, L(top)
-
-L(end): l %r11, 0(up)
- srl %r11, 0(cnt)
- st %r11, 0(rp)
-
- lr %r2, %r12
- lm %r6, %r12, 24(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/README b/gmp/mpn/s390_64/README
deleted file mode 100644
index 8f482a9cd2..0000000000
--- a/gmp/mpn/s390_64/README
+++ /dev/null
@@ -1,88 +0,0 @@
-Copyright 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
-
-
-
-There are 5 generations of 64-but s390 processors, z900, z990, z9,
-z10, and z196. The current GMP code was optimised for the two oldest,
-z900 and z990.
-
-
-mpn_copyi
-
-This code makes use of a loop around MVC. It almost surely runs very
-close to optimally. A small improvement could be done by using one
-MVC for size 256 bytes, now we use two (we use an extra MVC when
-copying any multiple of 256 bytes).
-
-
-mpn_copyd
-
-We have tried several feed-in variants here, branch tree, jump table
-and computed goto. The fastest (on z990) turned out to be computed
-goto.
-
-An approach not tried is EX of LMG and STMG, modifying the register set
-on-the-fly. Using that trick, we could completely avoid using
-separate feed-in paths.
-
-
-mpn_lshift, mpn_rshift
-
-The current code runs at pipeline decode bandwith on z990.
-
-
-mpn_add_n, mpn_sub_n
-
-The current code is 4-way unrolled. It should be unrolled more, at
-least 8x, in order to reach 2.5 c/l.
-
-
-mpn_mul_1, mpn_addmul_1, mpn_submul_1
-
-The current code is very naive, but due to the non-pipelined nature of
-MLGR on z900 and z990, more sophisticated code would not gain much.
-
-On z10 one would need to cluster at least 4 MLGR together, in order to
-reduce stalling.
-
-On z196, one surely want to use unrolling and pipelining, to perhaps
-reach around 12 c/l. A major issue here and on z10 is ALCGR's 3 cycle
-stalling.
-
-
-mpn_mul_2, mpn_addmul_2
-
-At least for older machines (z900, z990) with very slow MLGR, we
-should use Karatsuba's algorithm on 2-limb units, making mul_2 and
-addmul_2 the main multiplication primitives. The newer machines might
-benefit less from this approach, perhaps in particular z10, where MLGR
-clustering is more important.
-
-With Karatsuba, one could hope for around 16 cycles per accumulated
-128 cross product, on z990.
diff --git a/gmp/mpn/s390_64/addmul_1.asm b/gmp/mpn/s390_64/addmul_1.asm
deleted file mode 100644
index 84cca12361..0000000000
--- a/gmp/mpn/s390_64/addmul_1.asm
+++ /dev/null
@@ -1,72 +0,0 @@
-dnl S/390-64 mpn_addmul_1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 34
-C z990 23
-C z9 ?
-C z10 28
-C z196 ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`v0', `%r5')
-
-define(`z', `%r9')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- stmg %r9, %r12, 72(%r15)
- lghi %r12, 0 C zero index register
- aghi %r12, 0 C clear carry flag
- lghi %r11, 0 C clear carry limb
- lghi z, 0 C keep register zero
-
-L(top): lg %r1, 0(%r12,up)
- lg %r10, 0(%r12,rp)
- mlgr %r0, v0
- alcgr %r1, %r10
- alcgr %r0, z
- algr %r1, %r11
- lgr %r11, %r0
- stg %r1, 0(%r12,rp)
- la %r12, 8(%r12)
- brctg n, L(top)
-
- lghi %r2, 0
- alcgr %r2, %r11
-
- lmg %r9, %r12, 72(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/aorrlsh1_n.asm b/gmp/mpn/s390_64/aorrlsh1_n.asm
deleted file mode 100644
index 697259efef..0000000000
--- a/gmp/mpn/s390_64/aorrlsh1_n.asm
+++ /dev/null
@@ -1,168 +0,0 @@
-dnl S/390-64 mpn_addlsh1_n and mpn_rsblsh1_n.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 9
-C z990 4.75
-C z9 ?
-C z10 11
-C z196 ?
-
-C TODO
-C * Optimise for small n, avoid 'la' like in aors_n.asm.
-C * Tune to reach 3.5 c/l. For addlsh1, we could let the main alcgr propagate
-C carry to the lsh1 alcgr.
-C * Compute RETVAL for rsblsh1_n less stupidly.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`vp', `%r4')
-define(`n', `%r5')
-
-ifdef(`OPERATION_addlsh1_n',`
- define(ADSB, alg)
- define(ADSBC, alcg)
- define(INITCY, `lghi %r9, -1')
- define(RETVAL, `la %r2, 2(%r1,%r9)')
- define(func, mpn_addlsh1_n)
-')
-ifdef(`OPERATION_rsblsh1_n',`
- define(ADSB, slg)
- define(ADSBC, slbg)
- define(INITCY, `lghi %r9, 0')
- define(RETVAL,`dnl
- algr %r1, %r9
- lghi %r2, 1
- algr %r2, %r1')
- define(func, mpn_rsblsh1_n)
-')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
-
-ASM_START()
-PROLOGUE(func)
- stmg %r6, %r9, 48(%r15)
-
- aghi n, 3
- lghi %r7, 3
- srlg %r0, n, 2
- ngr %r7, n C n mod 4
- je L(b1)
- cghi %r7, 2
- jl L(b2)
- jne L(b0)
-
-L(b3): lmg %r5, %r7, 0(vp)
- la vp, 24(vp)
-
- algr %r5, %r5
- alcgr %r6, %r6
- alcgr %r7, %r7
- slbgr %r1, %r1
-
- ADSB %r5, 0(up)
- ADSBC %r6, 8(up)
- ADSBC %r7, 16(up)
- la up, 24(up)
- slbgr %r9, %r9
-
- stmg %r5, %r7, 0(rp)
- la rp, 24(rp)
- brctg %r0, L(top)
- j L(end)
-
-L(b0): lghi %r1, -1
- INITCY
- j L(top)
-
-L(b1): lg %r5, 0(vp)
- la vp, 8(vp)
-
- algr %r5, %r5
- slbgr %r1, %r1
- ADSB %r5, 0(up)
- la up, 8(up)
- slbgr %r9, %r9
-
- stg %r5, 0(rp)
- la rp, 8(rp)
- brctg %r0, L(top)
- j L(end)
-
-L(b2): lmg %r5, %r6, 0(vp)
- la vp, 16(vp)
-
- algr %r5, %r5
- alcgr %r6, %r6
- slbgr %r1, %r1
-
- ADSB %r5, 0(up)
- ADSBC %r6, 8(up)
- la up, 16(up)
- slbgr %r9, %r9
-
- stmg %r5, %r6, 0(rp)
- la rp, 16(rp)
- brctg %r0, L(top)
- j L(end)
-
-L(top): lmg %r5, %r8, 0(vp)
- la vp, 32(vp)
-
- aghi %r1, 1 C restore carry
-
- alcgr %r5, %r5
- alcgr %r6, %r6
- alcgr %r7, %r7
- alcgr %r8, %r8
-
- slbgr %r1, %r1 C save carry
-
- aghi %r9, 1 C restore carry
-
- ADSBC %r5, 0(up)
- ADSBC %r6, 8(up)
- ADSBC %r7, 16(up)
- ADSBC %r8, 24(up)
- la up, 32(up)
-
- slbgr %r9, %r9 C save carry
-
- stmg %r5, %r8, 0(rp)
- la rp, 32(rp)
- brctg %r0, L(top)
-
-L(end): RETVAL
- lmg %r6, %r9, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/aors_n.asm b/gmp/mpn/s390_64/aors_n.asm
deleted file mode 100644
index a3c3ca791c..0000000000
--- a/gmp/mpn/s390_64/aors_n.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-dnl S/390-64 mpn_add_n and mpn_sub_n.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 5.5
-C z990 3
-C z9 ?
-C z10 6
-C z196 ?
-
-C TODO
-C * Optimise for small n
-C * Use r0 and save/restore one less register
-C * Using logops_n's v1 inner loop operand order makes the loop about 20%
-C faster, at the expense of highly alignment-dependent performance.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`vp', `%r4')
-define(`n', `%r5')
-
-ifdef(`OPERATION_add_n', `
- define(ADSB, alg)
- define(ADSBCR, alcgr)
- define(ADSBC, alcg)
- define(RETVAL,`dnl
- lghi %r2, 0
- alcgr %r2, %r2')
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(ADSB, slg)
- define(ADSBCR, slbgr)
- define(ADSBC, slbg)
- define(RETVAL,`dnl
- slbgr %r2, %r2
- lcgr %r2, %r2')
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
-
-ASM_START()
-PROLOGUE(func)
- stmg %r6, %r8, 48(%r15)
-
- aghi n, 3
- lghi %r7, 3
- srlg %r1, n, 2
- ngr %r7, n C n mod 4
- je L(b1)
- cghi %r7, 2
- jl L(b2)
- jne L(b0)
-
-L(b3): lmg %r5, %r7, 0(up)
- la up, 24(up)
- ADSB %r5, 0(vp)
- ADSBC %r6, 8(vp)
- ADSBC %r7, 16(vp)
- la vp, 24(vp)
- stmg %r5, %r7, 0(rp)
- la rp, 24(rp)
- brctg %r1, L(top)
- j L(end)
-
-L(b0): lmg %r5, %r8, 0(up) C These redundant insns are no mistake,
- la up, 32(up) C they are needed to make the main loop
- ADSB %r5, 0(vp) C run fast for n = 0 (mod 4).
- ADSBC %r6, 8(vp)
- j L(m0)
-
-L(b1): lg %r5, 0(up)
- la up, 8(up)
- ADSB %r5, 0(vp)
- la vp, 8(vp)
- stg %r5, 0(rp)
- la rp, 8(rp)
- brctg %r1, L(top)
- j L(end)
-
-L(b2): lmg %r5, %r6, 0(up)
- la up, 16(up)
- ADSB %r5, 0(vp)
- ADSBC %r6, 8(vp)
- la vp, 16(vp)
- stmg %r5, %r6, 0(rp)
- la rp, 16(rp)
- brctg %r1, L(top)
- j L(end)
-
-L(top): lmg %r5, %r8, 0(up)
- la up, 32(up)
- ADSBC %r5, 0(vp)
- ADSBC %r6, 8(vp)
-L(m0): ADSBC %r7, 16(vp)
- ADSBC %r8, 24(vp)
- la vp, 32(vp)
- stmg %r5, %r8, 0(rp)
- la rp, 32(rp)
- brctg %r1, L(top)
-
-L(end): RETVAL
- lmg %r6, %r8, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/bdiv_dbm1c.asm b/gmp/mpn/s390_64/bdiv_dbm1c.asm
deleted file mode 100644
index 35e900a279..0000000000
--- a/gmp/mpn/s390_64/bdiv_dbm1c.asm
+++ /dev/null
@@ -1,65 +0,0 @@
-dnl S/390-64 mpn_bdiv_dbm1c
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 29
-C z990 22
-C z9 ?
-C z10 19
-C z196 ?
-
-C INPUT PARAMETERS
-define(`qp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`bd', `%r5')
-define(`cy', `%r6')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_bdiv_dbm1c)
- stmg %r6, %r7, 48(%r15)
- lghi %r7, 0 C zero index register
-
-L(top): lg %r1, 0(%r7,up)
- mlgr %r0, bd
- slgr %r6, %r1
- stg %r6, 0(%r7,qp)
- la %r7, 8(%r7)
- slbgr %r6, %r0
- brctg n, L(top)
-
- lgr %r2, %r6
- lmg %r6, %r7, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/copyd.asm b/gmp/mpn/s390_64/copyd.asm
deleted file mode 100644
index 8631e19f00..0000000000
--- a/gmp/mpn/s390_64/copyd.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl S/390-64 mpn_copyd
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 2.67
-C z990 1.5
-C z9 ?
-C z10 1.8
-C z196 ?
-
-C FIXME:
-C * Avoid saving/restoring callee-saved registers for n < 3. This could be
-C done by setting rp=r1, up=r2, i=r0 and r3,r4,r5 for clock regs.
-C We could then use r3...r10 in main loop.
-C * Could we use some EX trick, modifying lmg/stmg, for the feed-in code?
-
-C INPUT PARAMETERS
-define(`rp_param', `%r2')
-define(`up_param', `%r3')
-define(`n', `%r4')
-
-define(`rp', `%r8')
-define(`up', `%r9')
-
-ASM_START()
-PROLOGUE(mpn_copyd)
- stmg %r6, %r11, 48(%r15)
-
- sllg %r1, n, 3
- la %r10, 8(n)
- aghi %r1, -64
- srlg %r10, %r10, 3
- lghi %r11, -64
-
- la rp, 0(%r1,rp_param) C FIXME use lay on z990 and later
- la up, 0(%r1,up_param) C FIXME use lay on z990 and later
-
- lghi %r7, 7
- ngr %r7, n C n mod 8
- cghi %r7, 2
- jh L(b34567)
- cghi %r7, 1
- je L(b1)
- jh L(b2)
-
-L(b0): brctg %r10, L(top)
- j L(end)
-
-L(b1): lg %r0, 56(up)
- aghi up, -8
- stg %r0, 56(rp)
- aghi rp, -8
- brctg %r10, L(top)
- j L(end)
-
-L(b2): lmg %r0, %r1, 48(up)
- aghi up, -16
- stmg %r0, %r1, 48(rp)
- aghi rp, -16
- brctg %r10, L(top)
- j L(end)
-
-L(b34567):
- cghi %r7, 4
- jl L(b3)
- je L(b4)
- cghi %r7, 6
- je L(b6)
- jh L(b7)
-
-L(b5): lmg %r0, %r4, 24(up)
- aghi up, -40
- stmg %r0, %r4, 24(rp)
- aghi rp, -40
- brctg %r10, L(top)
- j L(end)
-
-L(b3): lmg %r0, %r2, 40(up)
- aghi up, -24
- stmg %r0, %r2, 40(rp)
- aghi rp, -24
- brctg %r10, L(top)
- j L(end)
-
-L(b4): lmg %r0, %r3, 32(up)
- aghi up, -32
- stmg %r0, %r3, 32(rp)
- aghi rp, -32
- brctg %r10, L(top)
- j L(end)
-
-L(b6): lmg %r0, %r5, 16(up)
- aghi up, -48
- stmg %r0, %r5, 16(rp)
- aghi rp, -48
- brctg %r10, L(top)
- j L(end)
-
-L(b7): lmg %r0, %r6, 8(up)
- aghi up, -56
- stmg %r0, %r6, 8(rp)
- aghi rp, -56
- brctg %r10, L(top)
- j L(end)
-
-L(top): lmg %r0, %r7, 0(up)
- la up, 0(%r11,up)
- stmg %r0, %r7, 0(rp)
- la rp, 0(%r11,rp)
- brctg %r10, L(top)
-
-L(end): lmg %r6, %r11, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/copyi.asm b/gmp/mpn/s390_64/copyi.asm
deleted file mode 100644
index bfb88814ea..0000000000
--- a/gmp/mpn/s390_64/copyi.asm
+++ /dev/null
@@ -1,68 +0,0 @@
-dnl S/390-64 mpn_copyi
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 1.25
-C z990 0.75
-C z9 ?
-C z10 1
-C z196 ?
-
-C NOTE
-C * This is based on GNU libc memcpy which was written by Martin Schwidefsky.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-
-ASM_START()
-PROLOGUE(mpn_copyi)
- ltgr %r4, %r4
- sllg %r4, %r4, 3
- je L(rtn)
- aghi %r4, -1
- srlg %r5, %r4, 8
- ltgr %r5, %r5 C < 256 bytes to copy?
- je L(1)
-
-L(top): mvc 0(256, rp), 0(up)
- la rp, 256(rp)
- la up, 256(up)
- brctg %r5, L(top)
-
-L(1): bras %r5, L(2) C make r5 point to mvc insn
- mvc 0(1, rp), 0(up)
-L(2): ex %r4, 0(%r5) C execute mvc with length ((n-1) mod 256)+1
-L(rtn): br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/gmp-mparam.h b/gmp/mpn/s390_64/gmp-mparam.h
deleted file mode 100644
index dacd9966a4..0000000000
--- a/gmp/mpn/s390_64/gmp-mparam.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/* S/390-64 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 1200 MHz z990 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 62
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 98
-
-#define MUL_TOOM22_THRESHOLD 10
-#define MUL_TOOM33_THRESHOLD 41
-#define MUL_TOOM44_THRESHOLD 105
-#define MUL_TOOM6H_THRESHOLD 149
-#define MUL_TOOM8H_THRESHOLD 212
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 72
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 55
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 16
-#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 153
-#define SQR_TOOM6_THRESHOLD 204
-#define SQR_TOOM8_THRESHOLD 309
-
-#define MULMID_TOOM42_THRESHOLD 20
-
-#define MULMOD_BNM1_THRESHOLD 10
-#define SQRMOD_BNM1_THRESHOLD 11
-
-#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 220, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
- { 15, 7}, { 8, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
- { 9, 7}, { 19, 8}, { 13, 9}, { 7, 8}, \
- { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
- { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
- { 23,10}, { 15, 9}, { 39,10}, { 23,11}, \
- { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \
- { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \
- { 287,10}, { 79,11}, { 47,12}, { 31,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
- { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \
- { 319,10}, { 175, 9}, { 351, 8}, { 703, 7}, \
- { 1407,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207,11}, { 111,10}, { 223,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,11}, { 143,10}, \
- { 287, 9}, { 575, 8}, { 1151,10}, { 319,11}, \
- { 175,10}, { 351, 9}, { 703,12}, { 95,11}, \
- { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
- { 415,11}, { 223,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 97
-#define MUL_FFT_THRESHOLD 1728
-
-#define SQR_FFT_MODF_THRESHOLD 212 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 212, 5}, { 7, 4}, { 15, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 15, 7}, { 8, 6}, \
- { 17, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
- { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \
- { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \
- { 25,10}, { 7, 9}, { 15, 8}, { 31, 9}, \
- { 19, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
- { 39,10}, { 23,11}, { 15,10}, { 31, 9}, \
- { 63,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \
- { 287,10}, { 79,11}, { 47,12}, { 31,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
- { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \
- { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \
- { 703,10}, { 191, 9}, { 383, 8}, { 767,10}, \
- { 207, 9}, { 415,11}, { 111,10}, { 223,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
- { 143,10}, { 287, 9}, { 575, 8}, { 1151,11}, \
- { 159,10}, { 319,11}, { 175,10}, { 351, 9}, \
- { 703,11}, { 191,10}, { 383,11}, { 207,10}, \
- { 415,11}, { 223,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 93
-#define SQR_FFT_THRESHOLD 1600
-
-#define MULLO_BASECASE_THRESHOLD 2
-#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 3176
-
-#define DC_DIV_QR_THRESHOLD 28
-#define DC_DIVAPPR_Q_THRESHOLD 107
-#define DC_BDIV_QR_THRESHOLD 31
-#define DC_BDIV_Q_THRESHOLD 78
-
-#define INV_MULMOD_BNM1_THRESHOLD 43
-#define INV_NEWTON_THRESHOLD 129
-#define INV_APPR_THRESHOLD 117
-
-#define BINV_NEWTON_THRESHOLD 149
-#define REDC_1_TO_REDC_N_THRESHOLD 38
-
-#define MU_DIV_QR_THRESHOLD 748
-#define MU_DIVAPPR_Q_THRESHOLD 748
-#define MUPI_DIV_QR_THRESHOLD 65
-#define MU_BDIV_QR_THRESHOLD 562
-#define MU_BDIV_Q_THRESHOLD 734
-
-#define POWM_SEC_TABLE 4,23,274,961,2783
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 79
-#define HGCD_APPR_THRESHOLD 70
-#define HGCD_REDUCE_THRESHOLD 1094
-#define GCD_DC_THRESHOLD 183
-#define GCDEXT_DC_THRESHOLD 148
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 30
-#define GET_STR_PRECOMPUTE_THRESHOLD 41
-#define SET_STR_DC_THRESHOLD 402
-#define SET_STR_PRECOMPUTE_THRESHOLD 1104
-
-#define FAC_DSC_THRESHOLD 842
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/s390_64/invert_limb.asm b/gmp/mpn/s390_64/invert_limb.asm
deleted file mode 100644
index edcebddf1c..0000000000
--- a/gmp/mpn/s390_64/invert_limb.asm
+++ /dev/null
@@ -1,94 +0,0 @@
-dnl S/390-64 mpn_invert_limb
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 142
-C z990 86
-C z9 ?
-C z10 120
-C z196 ?
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_invert_limb)
- stg %r9, 72(%r15)
- srlg %r9, %r2, 55
- agr %r9, %r9
- larl %r4, approx_tab-512
- srlg %r3, %r2, 24
- aghi %r3, 1
- lghi %r5, 1
- llgh %r4, 0(%r9, %r4)
- sllg %r9, %r4, 11
- msgr %r4, %r4
- msgr %r4, %r3
- srlg %r4, %r4, 40
- aghi %r9, -1
- sgr %r9, %r4
- sllg %r0, %r9, 60
- sllg %r1, %r9, 13
- msgr %r9, %r9
- msgr %r9, %r3
- sgr %r0, %r9
- ngr %r5, %r2
- srlg %r4, %r2, 1
- srlg %r3, %r0, 47
- agr %r3, %r1
- agr %r4, %r5
- msgr %r4, %r3
- srlg %r1, %r3, 1
- lcgr %r5, %r5
- ngr %r1, %r5
- sgr %r1, %r4
- mlgr %r0, %r3
- srlg %r9, %r0, 1
- sllg %r4, %r3, 31
- agr %r4, %r9
- lgr %r1, %r4
- mlgr %r0, %r2
- algr %r1, %r2
- alcgr %r0, %r2
- lgr %r2, %r4
- sgr %r2, %r0
- lg %r9, 72(%r15)
- br %r14
-EPILOGUE()
- RODATA
- ALIGN(2)
-approx_tab:
-forloop(i,256,512-1,dnl
-` .word eval(0x7fd00/i)
-')dnl
-ASM_END()
diff --git a/gmp/mpn/s390_64/logops_n.asm b/gmp/mpn/s390_64/logops_n.asm
deleted file mode 100644
index 914cfb6a41..0000000000
--- a/gmp/mpn/s390_64/logops_n.asm
+++ /dev/null
@@ -1,291 +0,0 @@
-dnl S/390-64 logops.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb variant 1 variant 2 variant 3
-C rp!=up rp=up
-C z900 4.5 2.25 5.5 5.5
-C z990 2.75 2 3.25 3.25
-C z9 ? ? ?
-C z10 3.25 3.75 3.75
-C z196 ? ? ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`vp', `%r4')
-define(`n', `%r5')
-
-ifdef(`OPERATION_and_n',`
- define(`func',`mpn_and_n')
- define(`VARIANT_1')
- define(`LOGOPC',`nc')
- define(`LOGOP',`ng')')
-ifdef(`OPERATION_andn_n',`
- define(`func',`mpn_andn_n')
- define(`VARIANT_2')
- define(`LOGOP',`ng')')
-ifdef(`OPERATION_nand_n',`
- define(`func',`mpn_nand_n')
- define(`VARIANT_3')
- define(`LOGOP',`ng')')
-ifdef(`OPERATION_ior_n',`
- define(`func',`mpn_ior_n')
- define(`VARIANT_1')
- define(`LOGOPC',`oc')
- define(`LOGOP',`og')')
-ifdef(`OPERATION_iorn_n',`
- define(`func',`mpn_iorn_n')
- define(`VARIANT_2')
- define(`LOGOP',`og')')
-ifdef(`OPERATION_nior_n',`
- define(`func',`mpn_nior_n')
- define(`VARIANT_3')
- define(`LOGOP',`og')')
-ifdef(`OPERATION_xor_n',`
- define(`func',`mpn_xor_n')
- define(`VARIANT_1')
- define(`LOGOPC',`xc')
- define(`LOGOP',`xg')')
-ifdef(`OPERATION_xnor_n',`
- define(`func',`mpn_xnor_n')
- define(`VARIANT_2')
- define(`LOGOP',`xg')')
-
-MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-
-ASM_START()
-PROLOGUE(func)
-ifdef(`VARIANT_1',`
- cgr rp, up
- jne L(normal)
-
- sllg n, n, 3
- aghi n, -1
- srlg %r1, n, 8
- ltgr %r1, %r1 C < 256 bytes to process?
- je L(1)
-
-L(tp): LOGOPC 0(256, rp), 0(vp)
- la rp, 256(rp)
- la vp, 256(vp)
- brctg %r1, L(tp)
-
-L(1): bras %r1, L(2) C make r1 point to LOGOPC insn
- LOGOPC 0(1, rp), 0(vp)
-L(2): ex n, 0(%r1) C execute LOGOPC with length ((n-1) mod 256)+1
-L(rtn): br %r14
-
-
-L(normal):
- stmg %r6, %r8, 48(%r15)
- aghi n, 3
- lghi %r7, 3
- srlg %r0, n, 2
- ngr %r7, n C n mod 4
- je L(b1)
- cghi %r7, 2
- jl L(b2)
- jne L(top)
-
-L(b3): lmg %r5, %r7, 0(up)
- la up, 24(up)
- LOGOP %r5, 0(vp)
- LOGOP %r6, 8(vp)
- LOGOP %r7, 16(vp)
- stmg %r5, %r7, 0(rp)
- la rp, 24(rp)
- la vp, 24(vp)
- j L(mid)
-
-L(b1): lg %r5, 0(up)
- la up, 8(up)
- LOGOP %r5, 0(vp)
- stg %r5, 0(rp)
- la rp, 8(rp)
- la vp, 8(vp)
- j L(mid)
-
-L(b2): lmg %r5, %r6, 0(up)
- la up, 16(up)
- LOGOP %r5, 0(vp)
- LOGOP %r6, 8(vp)
- stmg %r5, %r6, 0(rp)
- la rp, 16(rp)
- la vp, 16(vp)
- j L(mid)
-
-L(top): lmg %r5, %r8, 0(up)
- la up, 32(up)
- LOGOP %r5, 0(vp)
- LOGOP %r6, 8(vp)
- LOGOP %r7, 16(vp)
- LOGOP %r8, 24(vp)
- stmg %r5, %r8, 0(rp)
- la rp, 32(rp)
- la vp, 32(vp)
-L(mid): brctg %r0, L(top)
-
- lmg %r6, %r8, 48(%r15)
- br %r14
-')
-
-ifdef(`VARIANT_2',`
- stmg %r6, %r8, 48(%r15)
- lghi %r1, -1
-
- aghi n, 3
- lghi %r7, 3
- srlg %r0, n, 2
- ngr %r7, n C n mod 4
- je L(b1)
- cghi %r7, 2
- jl L(b2)
- jne L(top)
-
-L(b3): lmg %r5, %r7, 0(vp)
- la vp, 24(vp)
- xgr %r5, %r1
- xgr %r6, %r1
- xgr %r7, %r1
- LOGOP %r5, 0(up)
- LOGOP %r6, 8(up)
- LOGOP %r7, 16(up)
- stmg %r5, %r7, 0(rp)
- la rp, 24(rp)
- la up, 24(up)
- j L(mid)
-
-L(b1): lg %r5, 0(vp)
- la vp, 8(vp)
- xgr %r5, %r1
- LOGOP %r5, 0(up)
- stg %r5, 0(rp)
- la rp, 8(rp)
- la up, 8(up)
- j L(mid)
-
-L(b2): lmg %r5, %r6, 0(vp)
- la vp, 16(vp)
- xgr %r5, %r1
- xgr %r6, %r1
- LOGOP %r5, 0(up)
- LOGOP %r6, 8(up)
- stmg %r5, %r6, 0(rp)
- la rp, 16(rp)
- la up, 16(up)
- j L(mid)
-
-L(top): lmg %r5, %r8, 0(vp)
- la vp, 32(vp)
- xgr %r5, %r1
- xgr %r6, %r1
- xgr %r7, %r1
- xgr %r8, %r1
- LOGOP %r5, 0(up)
- LOGOP %r6, 8(up)
- LOGOP %r7, 16(up)
- LOGOP %r8, 24(up)
- la up, 32(up)
- stmg %r5, %r8, 0(rp)
- la rp, 32(rp)
-L(mid): brctg %r0, L(top)
-
- lmg %r6, %r8, 48(%r15)
- br %r14
-')
-
-ifdef(`VARIANT_3',`
- stmg %r6, %r8, 48(%r15)
- lghi %r1, -1
-
- aghi n, 3
- lghi %r7, 3
- srlg %r0, n, 2
- ngr %r7, n C n mod 4
- je L(b1)
- cghi %r7, 2
- jl L(b2)
- jne L(top)
-
-L(b3): lmg %r5, %r7, 0(vp)
- la vp, 24(vp)
- LOGOP %r5, 0(up)
- LOGOP %r6, 8(up)
- xgr %r5, %r1
- xgr %r6, %r1
- LOGOP %r7, 16(up)
- xgr %r7, %r1
- stmg %r5, %r7, 0(rp)
- la rp, 24(rp)
- la up, 24(up)
- j L(mid)
-
-L(b1): lg %r5, 0(vp)
- la vp, 8(vp)
- LOGOP %r5, 0(up)
- xgr %r5, %r1
- stg %r5, 0(rp)
- la rp, 8(rp)
- la up, 8(up)
- j L(mid)
-
-L(b2): lmg %r5, %r6, 0(vp)
- la vp, 16(vp)
- LOGOP %r5, 0(up)
- LOGOP %r6, 8(up)
- xgr %r5, %r1
- xgr %r6, %r1
- stmg %r5, %r6, 0(rp)
- la rp, 16(rp)
- la up, 16(up)
- j L(mid)
-
-L(top): lmg %r5, %r8, 0(vp)
- la vp, 32(vp)
- LOGOP %r5, 0(up)
- LOGOP %r6, 8(up)
- xgr %r5, %r1
- xgr %r6, %r1
- LOGOP %r7, 16(up)
- LOGOP %r8, 24(up)
- xgr %r7, %r1
- xgr %r8, %r1
- stmg %r5, %r8, 0(rp)
- la up, 32(up)
- la rp, 32(rp)
-L(mid): brctg %r0, L(top)
-
- lmg %r6, %r8, 48(%r15)
- br %r14
-')
-
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/lshift.asm b/gmp/mpn/s390_64/lshift.asm
deleted file mode 100644
index 4dae035a62..0000000000
--- a/gmp/mpn/s390_64/lshift.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl S/390-64 mpn_lshift.
-
-dnl Copyright 2011, 2012, 2014 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 7
-C z990 3
-C z9 ?
-C z10 6
-C z196 ?
-
-C NOTES
-C * This uses discrete loads and stores in a software pipeline. Using lmg and
-C stmg is not faster.
-C * One could assume more pipelining could approach 2.5 c/l, but we have not
-C found any 8-way loop that runs better than the current 4-way loop.
-C * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
-C similarly to the x86_64 sqr_basecase feed-in.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`cnt', `%r5')
-
-define(`tnc', `%r6')
-
-ASM_START()
-PROLOGUE(mpn_lshift)
- cghi n, 3
- jh L(gt1) C n > 3: use the 4-way unrolled loop
-
- stmg %r6, %r7, 48(%r15)
- larl %r1, L(tab)-4 C biased base so that n = 1 selects the first table entry
- lcgr tnc, cnt C tnc = -cnt
- sllg n, n, 2 C n * 4 = byte offset, assuming each j entry is 4 bytes
- b 0(n,%r1)
-L(tab): j L(n1)
- j L(n2)
- j L(n3)
-
-L(n1): lg %r1, 0(up)
- sllg %r0, %r1, 0(cnt)
- stg %r0, 0(rp)
- srlg %r2, %r1, 0(tnc) C %r2 = the limb shifted right by tnc
- lg %r6, 48(%r15) C restoring r7 not needed
- br %r14
-
-L(n2): lg %r1, 8(up)
- srlg %r4, %r1, 0(tnc) C kept in %r4, copied to %r2 at the end of L(cj)
- sllg %r0, %r1, 0(cnt)
- j L(cj)
-
-L(n3): lg %r1, 16(up)
- srlg %r4, %r1, 0(tnc) C kept in %r4, copied to %r2 at the end of L(cj)
- sllg %r0, %r1, 0(cnt)
- lg %r1, 8(up)
- srlg %r7, %r1, 0(tnc)
- ogr %r7, %r0
- sllg %r0, %r1, 0(cnt)
- stg %r7, 16(rp)
-L(cj): lg %r1, 0(up) C tail shared by the n = 2 and n = 3 cases
- srlg %r7, %r1, 0(tnc)
- ogr %r7, %r0
- sllg %r0, %r1, 0(cnt)
- stg %r7, 8(rp)
- stg %r0, 0(rp)
- lgr %r2, %r4
- lmg %r6, %r7, 48(%r15)
- br %r14
-
-L(gt1): stmg %r6, %r13, 48(%r15)
- lcgr tnc, cnt C tnc = -cnt
-
- sllg %r1, n, 3 C %r1 = 8 * n, the operand size in bytes
- srlg %r0, n, 2 C loop count
-
- agr up, %r1 C point up at end of U
- agr rp, %r1 C point rp at end of R
- aghi up, -56
- aghi rp, -40
-
- lghi %r7, 3
- ngr %r7, n C %r7 = n mod 4 selects the feed-in block
- je L(b0)
- cghi %r7, 2
- jl L(b1)
- je L(b2)
-
-L(b3): lg %r7, 48(up)
- srlg %r9, %r7, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r11, %r7, 0(cnt)
- lg %r8, 40(up)
- lg %r7, 32(up)
- srlg %r4, %r8, 0(tnc)
- sllg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- la rp, 16(rp)
- j L(lm3)
-
-L(b2): lg %r8, 48(up)
- lg %r7, 40(up)
- srlg %r9, %r8, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r13, %r8, 0(cnt)
- la rp, 24(rp)
- la up, 8(up)
- j L(lm2)
-
-L(b1): lg %r7, 48(up)
- srlg %r9, %r7, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r11, %r7, 0(cnt)
- lg %r8, 40(up)
- lg %r7, 32(up)
- srlg %r4, %r8, 0(tnc)
- sllg %r10, %r8, 0(cnt)
- ogr %r11, %r4
- la rp, 32(rp)
- la up, 16(up)
- j L(lm1)
-
-L(b0): lg %r8, 48(up)
- lg %r7, 40(up)
- srlg %r9, %r8, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r10, %r8, 0(cnt)
- la rp, 40(rp)
- la up, 24(up)
- j L(lm0)
-
- ALIGN(8)
-L(top): srlg %r4, %r8, 0(tnc)
- sllg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- stg %r10, 24(rp)
-L(lm3): stg %r11, 16(rp)
-L(lm2): srlg %r12, %r7, 0(tnc)
- sllg %r11, %r7, 0(cnt)
- lg %r8, 24(up)
- lg %r7, 16(up)
- ogr %r13, %r12
- srlg %r4, %r8, 0(tnc)
- sllg %r10, %r8, 0(cnt)
- ogr %r11, %r4
- stg %r13, 8(rp)
-L(lm1): stg %r11, 0(rp)
-L(lm0): srlg %r12, %r7, 0(tnc)
- aghi rp, -32 C both pointers step downwards, 4 limbs per iteration
- sllg %r11, %r7, 0(cnt)
- lg %r8, 8(up)
- lg %r7, 0(up)
- aghi up, -32
- ogr %r10, %r12
- brctg %r0, L(top)
-
-L(end): srlg %r4, %r8, 0(tnc) C drain the limbs still held in registers
- sllg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- stg %r10, 24(rp)
- stg %r11, 16(rp)
- srlg %r12, %r7, 0(tnc)
- sllg %r11, %r7, 0(cnt)
- ogr %r13, %r12
- stg %r13, 8(rp)
- stg %r11, 0(rp)
- lgr %r2, %r9
-
- lmg %r6, %r13, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/lshiftc.asm b/gmp/mpn/s390_64/lshiftc.asm
deleted file mode 100644
index 92552d529a..0000000000
--- a/gmp/mpn/s390_64/lshiftc.asm
+++ /dev/null
@@ -1,207 +0,0 @@
-dnl S/390-64 mpn_lshiftc.
-
-dnl Copyright 2011, 2014 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 9
-C z990 3.5
-C z9 ?
-C z10 7
-C z196 ?
-
-C NOTES
-C * See notes in lshift.asm.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`cnt', `%r5')
-
-define(`tnc', `%r6')
-
-ASM_START()
-PROLOGUE(mpn_lshiftc)
- cghi n, 3
- jh L(gt1) C n > 3: use the 4-way unrolled loop
-
- stmg %r6, %r8, 48(%r15)
- larl %r1, L(tab)-4 C biased base so that n = 1 selects the first table entry
- lcgr tnc, cnt C tnc = -cnt
- sllg n, n, 2 C n * 4 = byte offset, assuming each j entry is 4 bytes
- lghi %r8, -1 C all-ones mask, xor with it complements a limb
- b 0(n,%r1)
-L(tab): j L(n1)
- j L(n2)
- j L(n3)
-
-L(n1): lg %r1, 0(up)
- sllg %r0, %r1, 0(cnt)
- xgr %r0, %r8 C complement the shifted limb before storing
- stg %r0, 0(rp)
- srlg %r2, %r1, 0(tnc) C %r2 is not complemented
- lmg %r6, %r8, 48(%r15)
- br %r14
-
-L(n2): lg %r1, 8(up)
- srlg %r4, %r1, 0(tnc) C kept in %r4, copied to %r2 at the end of L(cj)
- sllg %r0, %r1, 0(cnt)
- j L(cj)
-
-L(n3): lg %r1, 16(up)
- srlg %r4, %r1, 0(tnc) C kept in %r4, copied to %r2 at the end of L(cj)
- sllg %r0, %r1, 0(cnt)
- lg %r1, 8(up)
- srlg %r7, %r1, 0(tnc)
- ogr %r7, %r0
- sllg %r0, %r1, 0(cnt)
- xgr %r7, %r8
- stg %r7, 16(rp)
-L(cj): lg %r1, 0(up) C tail shared by the n = 2 and n = 3 cases
- srlg %r7, %r1, 0(tnc)
- ogr %r7, %r0
- sllg %r0, %r1, 0(cnt)
- xgr %r7, %r8
- xgr %r0, %r8
- stg %r7, 8(rp)
- stg %r0, 0(rp)
- lgr %r2, %r4
- lmg %r6, %r8, 48(%r15)
- br %r14
-
-L(gt1): stmg %r6, %r14, 48(%r15) C %r14 is saved too, it is reused as the mask below
- lcgr tnc, cnt C tnc = -cnt
-
- sllg %r1, n, 3 C %r1 = 8 * n, the operand size in bytes
- srlg %r0, n, 2 C loop count
-
- agr up, %r1 C point up at end of U
- agr rp, %r1 C point rp at end of R
- aghi up, -56
- aghi rp, -40
-
- lghi %r7, 3
- lghi %r14, -1 C all-ones mask, xor with it complements a limb
- ngr %r7, n C %r7 = n mod 4 selects the feed-in block
- je L(b0)
- cghi %r7, 2
- jl L(b1)
- je L(b2)
-
-L(b3): lg %r7, 48(up)
- srlg %r9, %r7, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r11, %r7, 0(cnt)
- lg %r8, 40(up)
- lg %r7, 32(up)
- srlg %r4, %r8, 0(tnc)
- sllg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- la rp, 16(rp)
- xgr %r11, %r14
- j L(lm3)
-
-L(b2): lg %r8, 48(up)
- lg %r7, 40(up)
- srlg %r9, %r8, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r13, %r8, 0(cnt)
- la rp, 24(rp)
- la up, 8(up)
- j L(lm2)
-
-L(b1): lg %r7, 48(up)
- srlg %r9, %r7, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r11, %r7, 0(cnt)
- lg %r8, 40(up)
- lg %r7, 32(up)
- srlg %r4, %r8, 0(tnc)
- sllg %r10, %r8, 0(cnt)
- ogr %r11, %r4
- la rp, 32(rp)
- la up, 16(up)
- xgr %r11, %r14
- j L(lm1)
-
-L(b0): lg %r8, 48(up)
- lg %r7, 40(up)
- srlg %r9, %r8, 0(tnc) C %r9 is copied to %r2 after L(end)
- sllg %r10, %r8, 0(cnt)
- la rp, 40(rp)
- la up, 24(up)
- j L(lm0)
-
- ALIGN(8)
-L(top): srlg %r4, %r8, 0(tnc)
- sllg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- xgr %r10, %r14
- xgr %r11, %r14
- stg %r10, 24(rp)
-L(lm3): stg %r11, 16(rp)
-L(lm2): srlg %r12, %r7, 0(tnc)
- sllg %r11, %r7, 0(cnt)
- lg %r8, 24(up)
- lg %r7, 16(up)
- ogr %r13, %r12
- srlg %r4, %r8, 0(tnc)
- sllg %r10, %r8, 0(cnt)
- ogr %r11, %r4
- xgr %r13, %r14
- xgr %r11, %r14
- stg %r13, 8(rp)
-L(lm1): stg %r11, 0(rp)
-L(lm0): srlg %r12, %r7, 0(tnc)
- aghi rp, -32 C both pointers step downwards, 4 limbs per iteration
- sllg %r11, %r7, 0(cnt)
- lg %r8, 8(up)
- lg %r7, 0(up)
- aghi up, -32
- ogr %r10, %r12
- brctg %r0, L(top)
-
-L(end): srlg %r4, %r8, 0(tnc) C drain the limbs still held in registers
- sllg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- xgr %r10, %r14
- xgr %r11, %r14
- stg %r10, 24(rp)
- stg %r11, 16(rp)
- srlg %r12, %r7, 0(tnc)
- sllg %r11, %r7, 0(cnt)
- ogr %r13, %r12
- xgr %r13, %r14
- xgr %r11, %r14
- stg %r13, 8(rp)
- stg %r11, 0(rp)
- lgr %r2, %r9
-
- lmg %r6, %r14, 48(%r15) C reloads %r14 before it is used as the branch target
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/mod_34lsub1.asm b/gmp/mpn/s390_64/mod_34lsub1.asm
deleted file mode 100644
index fd40011a8c..0000000000
--- a/gmp/mpn/s390_64/mod_34lsub1.asm
+++ /dev/null
@@ -1,109 +0,0 @@
-dnl S/390-64 mpn_mod_34lsub1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 5.8
-C z990 2
-C z9 ?
-C z10 4.5
-C z196 ?
-
-C TODO
-C * Optimise summation code, see x86_64.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`n', `%r3')
-
-ASM_START()
-PROLOGUE(mpn_mod_34lsub1)
- stmg %r7, %r12, 56(%r15)
- lghi %r11, 0 C %r0, %r12, %r11 are the three limb accumulators of L(top)
- lghi %r12, 0
- lghi %r0, 0
- lghi %r8, 0
- lghi %r9, 0
- lghi %r10, 0
- lghi %r7, 0
- aghi %r3, -3 C the body uses %r2 and %r3 directly instead of the rp and n macros
- jl .L3 C fewer than 3 limbs: skip the 3-limb loop
-
-L(top): alg %r0, 0(%r2)
- alcg %r12, 8(%r2)
- alcg %r11, 16(%r2)
- alcgr %r8, %r7 C %r7 is zero here, so this adds only the carry into %r8
- la %r2, 24(%r2)
- aghi %r3, -3
- jnl L(top)
-
- lgr %r7, %r8
- srlg %r1, %r11, 16
- nihh %r7, 0 C 0xffffffffffff
- agr %r7, %r1
- srlg %r8, %r8, 48
- agr %r7, %r8
- sllg %r11, %r11, 32
- nihh %r11, 0
- agr %r7, %r11
-.L3:
- cghi %r3, -3
- je .L6 C no leftover limbs
- alg %r0, 0(%r2)
- alcgr %r10, %r10 C %r10 was zero, so it now holds just the carry
- cghi %r3, -2
- je .L6 C exactly one leftover limb
- alg %r12, 8(%r2)
- alcgr %r9, %r9 C %r9 was zero, so it now holds just the carry
-.L6:
- srlg %r1, %r0, 48
- nihh %r0, 0 C 0xffffffffffff
- agr %r0, %r1
- agr %r0, %r7
- srlg %r1, %r12, 32
- agr %r0, %r1
- srlg %r1, %r10, 32
- agr %r0, %r1
- llgfr %r12, %r12
- srlg %r1, %r9, 16
- sllg %r12, %r12, 16
- llgfr %r10, %r10
- agr %r0, %r1
- llill %r2, 65535
- agr %r0, %r12
- sllg %r10, %r10, 16
- ngr %r2, %r9
- agr %r0, %r10
- sllg %r2, %r2, 32
- agr %r2, %r0 C final sum is left in %r2
- lmg %r7, %r12, 56(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/mul_1.asm b/gmp/mpn/s390_64/mul_1.asm
deleted file mode 100644
index a8f6da9a0f..0000000000
--- a/gmp/mpn/s390_64/mul_1.asm
+++ /dev/null
@@ -1,66 +0,0 @@
-dnl S/390-64 mpn_mul_1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 29
-C z990 22
-C z9 ?
-C z10 20
-C z196 ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`v0', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_mul_1)
- stmg %r11, %r12, 88(%r15)
- lghi %r12, 0 C zero index register
- aghi %r12, 0 C clear carry flag
- lghi %r11, 0 C clear carry limb
-
-L(top): lg %r1, 0(%r12,up)
- mlgr %r0, v0 C %r0:%r1 = %r1 * v0 (high:low)
- alcgr %r1, %r11 C low part + previous high part + carry
- lgr %r11, %r0 C copy high part to carry limb
- stg %r1, 0(%r12,rp)
- la %r12, 8(%r12) C la leaves the condition code alone, so the carry survives
- brctg n, L(top)
-
- lghi %r2, 0
- alcgr %r2, %r11 C %r2 = last high part + final carry
-
- lmg %r11, %r12, 88(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/mul_basecase.asm b/gmp/mpn/s390_64/mul_basecase.asm
deleted file mode 100644
index 7d14ea98d2..0000000000
--- a/gmp/mpn/s390_64/mul_basecase.asm
+++ /dev/null
@@ -1,130 +0,0 @@
-dnl S/390-64 mpn_mul_basecase.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 ?
-C z990 23
-C z9 ?
-C z10 28
-C z196 ?
-
-C TODO
-C * Perhaps add special case for un <= 2.
-C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped
-C up by about 10%.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`un', `%r4')
-define(`vp', `%r5')
-define(`vn', `%r6')
-
-define(`zero', `%r8')
-
-ASM_START()
-PROLOGUE(mpn_mul_basecase)
- cghi un, 2
- jhe L(ge2)
-
-C un = vn = 1
- lg %r1, 0(vp)
- mlg %r0, 0(up) C %r0:%r1 = %r1 * up[0] (high:low)
- stg %r1, 0(rp)
- stg %r0, 8(rp)
- br %r14
-
-L(ge2): C jne L(gen)
-
-
-L(gen):
-C mul_1 =======================================================================
-
- stmg %r6, %r12, 48(%r15)
- lghi zero, 0
- aghi un, -1 C the first limb is handled before L(tm)
-
- lg %r7, 0(vp) C %r7 = vp[0], the multiplier for this pass
- lg %r11, 0(up)
- lghi %r12, 8 C init index register
- mlgr %r10, %r7 C %r10:%r11 = up[0] * %r7 (high:low)
- lgr %r9, un C %r9 = inner loop counter
- stg %r11, 0(rp)
- cr %r15, %r15 C clear carry flag
-
-L(tm): lg %r1, 0(%r12,up)
- mlgr %r0, %r7
- alcgr %r1, %r10
- lgr %r10, %r0 C copy high part to carry limb
- stg %r1, 0(%r12,rp)
- la %r12, 8(%r12)
- brctg %r9, L(tm)
-
- alcgr %r0, zero C fold the last carry into the top limb
- stg %r0, 0(%r12,rp)
-
-C addmul_1 loop ===============================================================
-
- aghi vn, -1
- je L(outer_end) C vn was 1: no further passes
-L(outer_loop):
-
- la rp, 8(rp) C rp += 1
- la vp, 8(vp) C vp += 1
- lg %r7, 0(vp) C %r7 = next multiplier limb
- lg %r11, 0(up)
- lghi %r12, 8 C init index register
- mlgr %r10, %r7
- lgr %r9, un
- alg %r11, 0(rp)
- stg %r11, 0(rp)
-
-L(tam): lg %r1, 0(%r12,up)
- lg %r11, 0(%r12,rp)
- mlgr %r0, %r7
- alcgr %r1, %r11
- alcgr %r0, zero
- algr %r1, %r10
- lgr %r10, %r0
- stg %r1, 0(%r12,rp)
- la %r12, 8(%r12)
- brctg %r9, L(tam)
-
- alcgr %r0, zero
- stg %r0, 0(%r12,rp)
-
- brctg vn, L(outer_loop)
-L(outer_end):
-
- lmg %r6, %r12, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/rshift.asm b/gmp/mpn/s390_64/rshift.asm
deleted file mode 100644
index e870971650..0000000000
--- a/gmp/mpn/s390_64/rshift.asm
+++ /dev/null
@@ -1,195 +0,0 @@
-dnl S/390-64 mpn_rshift.
-
-dnl Copyright 2011, 2014 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 7
-C z990 3
-C z9 ?
-C z10 6
-C z196 ?
-
-C NOTES
-C * See notes in lshift.asm.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`cnt', `%r5')
-
-define(`tnc', `%r6')
-
-ASM_START()
-PROLOGUE(mpn_rshift)
- cghi n, 3
- jh L(gt1) C n > 3: use the 4-way unrolled loop
-
- stmg %r6, %r7, 48(%r15)
- larl %r1, L(tab)-4 C biased base so that n = 1 selects the first table entry
- lcgr tnc, cnt C tnc = -cnt
- sllg n, n, 2 C n * 4 = byte offset, assuming each j entry is 4 bytes
- b 0(n,%r1)
-L(tab): j L(n1)
- j L(n2)
- j L(n3)
-
-L(n1): lg %r1, 0(up)
- srlg %r0, %r1, 0(cnt)
- stg %r0, 0(rp)
- sllg %r2, %r1, 0(tnc) C %r2 = the limb shifted left by tnc
- lg %r6, 48(%r15) C restoring r7 not needed
- br %r14
-
-L(n2): lg %r1, 0(up)
- sllg %r4, %r1, 0(tnc) C kept in %r4, copied to %r2 before returning
- srlg %r0, %r1, 0(cnt)
- lg %r1, 8(up)
- sllg %r7, %r1, 0(tnc)
- ogr %r7, %r0
- srlg %r0, %r1, 0(cnt)
- stg %r7, 0(rp)
- stg %r0, 8(rp)
- lgr %r2, %r4
- lmg %r6, %r7, 48(%r15)
- br %r14
-
-
-L(n3): lg %r1, 0(up)
- sllg %r4, %r1, 0(tnc) C kept in %r4, copied to %r2 before returning
- srlg %r0, %r1, 0(cnt)
- lg %r1, 8(up)
- sllg %r7, %r1, 0(tnc)
- ogr %r7, %r0
- srlg %r0, %r1, 0(cnt)
- stg %r7, 0(rp)
- lg %r1, 16(up)
- sllg %r7, %r1, 0(tnc)
- ogr %r7, %r0
- srlg %r0, %r1, 0(cnt)
- stg %r7, 8(rp)
- stg %r0, 16(rp)
- lgr %r2, %r4
- lmg %r6, %r7, 48(%r15)
- br %r14
-
-L(gt1): stmg %r6, %r13, 48(%r15)
- lcgr tnc, cnt C tnc = -cnt
-
- sllg %r1, n, 3 C NOTE(review): %r1 is not read again on this path in the visible code
- srlg %r0, n, 2 C loop count
-
- lghi %r7, 3
- ngr %r7, n C %r7 = n mod 4 selects the feed-in block
- je L(b0)
- cghi %r7, 2
- jl L(b1)
- je L(b2)
-
-L(b3): aghi rp, -8
- lg %r7, 0(up)
- sllg %r9, %r7, 0(tnc) C %r9 is copied to %r2 after L(end)
- srlg %r11, %r7, 0(cnt)
- lg %r8, 8(up)
- lg %r7, 16(up)
- sllg %r4, %r8, 0(tnc)
- srlg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- la up, 24(up)
- j L(lm3)
-
-L(b2): aghi rp, -16
- lg %r8, 0(up)
- lg %r7, 8(up)
- sllg %r9, %r8, 0(tnc) C %r9 is copied to %r2 after L(end)
- srlg %r13, %r8, 0(cnt)
- la up, 16(up)
- j L(lm2)
-
-L(b1): aghi rp, -24
- lg %r7, 0(up)
- sllg %r9, %r7, 0(tnc) C %r9 is copied to %r2 after L(end)
- srlg %r11, %r7, 0(cnt)
- lg %r8, 8(up)
- lg %r7, 16(up)
- sllg %r4, %r8, 0(tnc)
- srlg %r10, %r8, 0(cnt)
- ogr %r11, %r4
- la up, 8(up)
- j L(lm1)
-
-L(b0): aghi rp, -32
- lg %r8, 0(up)
- lg %r7, 8(up)
- sllg %r9, %r8, 0(tnc) C %r9 is copied to %r2 after L(end)
- srlg %r10, %r8, 0(cnt)
- j L(lm0)
-
- ALIGN(8)
-L(top): sllg %r4, %r8, 0(tnc)
- srlg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- stg %r10, 0(rp)
-L(lm3): stg %r11, 8(rp)
-L(lm2): sllg %r12, %r7, 0(tnc)
- srlg %r11, %r7, 0(cnt)
- lg %r8, 0(up)
- lg %r7, 8(up)
- ogr %r13, %r12
- sllg %r4, %r8, 0(tnc)
- srlg %r10, %r8, 0(cnt)
- ogr %r11, %r4
- stg %r13, 16(rp)
-L(lm1): stg %r11, 24(rp)
-L(lm0): sllg %r12, %r7, 0(tnc)
- aghi rp, 32 C both pointers step upwards, 4 limbs per iteration
- srlg %r11, %r7, 0(cnt)
- lg %r8, 16(up)
- lg %r7, 24(up)
- aghi up, 32
- ogr %r10, %r12
- brctg %r0, L(top)
-
-L(end): sllg %r4, %r8, 0(tnc) C drain the limbs still held in registers
- srlg %r13, %r8, 0(cnt)
- ogr %r11, %r4
- stg %r10, 0(rp)
- stg %r11, 8(rp)
- sllg %r12, %r7, 0(tnc)
- srlg %r11, %r7, 0(cnt)
- ogr %r13, %r12
- stg %r13, 16(rp)
- stg %r11, 24(rp)
- lgr %r2, %r9
-
- lmg %r6, %r13, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/sqr_basecase.asm b/gmp/mpn/s390_64/sqr_basecase.asm
deleted file mode 100644
index bf31bd5546..0000000000
--- a/gmp/mpn/s390_64/sqr_basecase.asm
+++ /dev/null
@@ -1,203 +0,0 @@
-dnl S/390-64 mpn_sqr_basecase.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 ?
-C z990 23
-C z9 ?
-C z10 28
-C z196 ?
-
-C TODO
-C * Clean up.
-C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
-C This will ask for basecase handling of n = 3.
-C * Update counters and pointers more straightforwardly, possibly lowering
-C register usage.
-C * Should we use this allocation-free style for more sqr_basecase asm
-C implementations? The only disadvantage is that it requires R != U.
-C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped
-C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even
-C more.
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-
-define(`zero', `%r8')
-define(`rp_saved', `%r9')
-define(`up_saved', `%r13')
-define(`n_saved', `%r14')
-
-ASM_START()
-PROLOGUE(mpn_sqr_basecase)
- aghi n, -2 C n is biased by -2 from here on
- jhe L(ge2)
-
-C n = 1
- lg %r5, 0(up)
- mlgr %r4, %r5 C %r4:%r5 = u0 * u0 (high:low)
- stg %r5, 0(rp)
- stg %r4, 8(rp)
- br %r14
-
-L(ge2): jne L(gen) C biased n is nonzero: general case
-
-C n = 2
- stmg %r6, %r8, 48(%r15)
- lghi zero, 0
-
- lg %r5, 0(up)
- mlgr %r4, %r5 C u0 * u0
- lg %r1, 8(up)
- mlgr %r0, %r1 C u1 * u1
- stg %r5, 0(rp)
-
- lg %r7, 0(up)
- mlg %r6, 8(up) C u0 * u1
- algr %r7, %r7 C double the cross product, carry goes into %r0
- alcgr %r6, %r6
- alcgr %r0, zero
-
- algr %r4, %r7
- alcgr %r1, %r6
- alcgr %r0, zero
- stg %r4, 8(rp)
- stg %r1, 16(rp)
- stg %r0, 24(rp)
-
- lmg %r6, %r8, 48(%r15)
- br %r14
-
-L(gen):
-C mul_1 =======================================================================
-
- stmg %r6, %r14, 48(%r15)
- lghi zero, 0
- lgr up_saved, up C originals kept for the sqr_diag_addlsh1 pass below
- lgr rp_saved, rp
- lgr n_saved, n C this is the biased n (original n - 2)
-
- lg %r6, 0(up)
- lg %r11, 8(up)
- lghi %r12, 16 C init index register
- mlgr %r10, %r6 C %r10:%r11 = u1 * u0 (high:low)
- lgr %r5, n C %r5 = inner loop counter
- stg %r11, 8(rp)
- cr %r15, %r15 C clear carry flag
-
-L(tm): lg %r1, 0(%r12,up)
- mlgr %r0, %r6
- alcgr %r1, %r10
- lgr %r10, %r0 C copy high part to carry limb
- stg %r1, 0(%r12,rp)
- la %r12, 8(%r12)
- brctg %r5, L(tm)
-
- alcgr %r0, zero
- stg %r0, 0(%r12,rp)
-
-C addmul_1 loop ===============================================================
-
- aghi n, -1
- je L(outer_end)
-L(outer_loop):
-
- la rp, 16(rp) C rp += 2
- la up, 8(up) C up += 1
- lg %r6, 0(up)
- lg %r11, 8(up)
- lghi %r12, 16 C init index register
- mlgr %r10, %r6
- lgr %r5, n
- alg %r11, 8(rp)
- stg %r11, 8(rp)
-
-L(tam): lg %r1, 0(%r12,up)
- lg %r7, 0(%r12,rp)
- mlgr %r0, %r6
- alcgr %r1, %r7
- alcgr %r0, zero
- algr %r1, %r10
- lgr %r10, %r0
- stg %r1, 0(%r12,rp)
- la %r12, 8(%r12)
- brctg %r5, L(tam)
-
- alcgr %r0, zero
- stg %r0, 0(%r12,rp)
-
- brctg n, L(outer_loop)
-L(outer_end):
-
- lg %r6, 8(up)
- lg %r1, 16(up)
- lgr %r7, %r0 C Same as: lg %r7, 24(,rp)
- mlgr %r0, %r6
- algr %r1, %r7
- alcgr %r0, zero
- stg %r1, 24(rp)
- stg %r0, 32(rp)
-
-C sqr_diag_addlsh1 ============================================================
-
-define(`up', `up_saved')
-define(`rp', `rp_saved')
- la n, 1(n_saved) C n = saved biased n + 1, the L(top) iteration count
-
- lg %r1, 0(up)
- mlgr %r0, %r1
- stg %r1, 0(rp)
-C clr %r15, %r15 C clear carry (already clear per above)
-
-L(top): lg %r11, 8(up)
- la up, 8(up)
- lg %r6, 8(rp)
- lg %r7, 16(rp)
- mlgr %r10, %r11 C %r10:%r11 = square of the limb just loaded
- alcgr %r6, %r6 C double two result limbs, chaining the carry
- alcgr %r7, %r7
- alcgr %r10, zero C propagate carry to high product limb
- algr %r6, %r0
- alcgr %r7, %r11
- stmg %r6, %r7, 8(rp)
- la rp, 16(rp)
- lgr %r0, %r10 C copy carry limb
- brctg n, L(top)
-
- alcgr %r0, zero
- stg %r0, 8(rp)
-
- lmg %r6, %r14, 48(%r15) C reloads %r14 before it is used as the branch target
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/sublsh1_n.asm b/gmp/mpn/s390_64/sublsh1_n.asm
deleted file mode 100644
index 50f127acef..0000000000
--- a/gmp/mpn/s390_64/sublsh1_n.asm
+++ /dev/null
@@ -1,169 +0,0 @@
-dnl S/390-64 mpn_sublsh1_n
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 10
-C z990 5
-C z9 ?
-C z10 12
-C z196 ?
-
-C TODO
-C * Optimise for small n
-C * Compute RETVAL for sublsh1_n less stupidly
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`vp', `%r4')
-define(`n', `%r5')
-
-ifdef(`OPERATION_addlsh1_n',`
- define(ADSBR, algr)
- define(ADSBCR, alcgr)
- define(INITCY, `lghi %r13, -1')
- define(RETVAL, `la %r2, 2(%r1,%r13)')
- define(func, mpn_addlsh1_n)
-')
-ifdef(`OPERATION_sublsh1_n',`
- define(ADSBR, slgr)
- define(ADSBCR, slbgr)
- define(INITCY, `lghi %r13, 0')
- define(RETVAL,`dnl
- slgr %r1, %r13
- lghi %r2, 1
- algr %r2, %r1')
- define(func, mpn_sublsh1_n)
-')
-
-ASM_START()
-PROLOGUE(mpn_sublsh1_n)
- stmg %r6, %r13, 48(%r15) C NOTE(review): PROLOGUE names the symbol literally, the func macro defined above is not used in the visible code
-
- aghi n, 3 C bias n so that the shift below rounds the block count up
- lghi %r7, 3
- srlg %r0, n, 2 C %r0 = block counter used by brctg
- ngr %r7, n C (n + 3) mod 4, since n was biased above
- je L(b1)
- cghi %r7, 2
- jl L(b2)
- jne L(b0)
-
-L(b3): lmg %r5, %r7, 0(up)
- la up, 24(up)
- lmg %r9, %r11, 0(vp)
- la vp, 24(vp)
-
- algr %r9, %r9 C double the three v limbs with a carry chain
- alcgr %r10, %r10
- alcgr %r11, %r11
- slbgr %r1, %r1 C %r1 keeps the shift carry state between blocks
-
- ADSBR %r5, %r9
- ADSBCR %r6, %r10
- ADSBCR %r7, %r11
- slbgr %r13, %r13 C %r13 keeps the add/subtract carry state between blocks
-
- stmg %r5, %r7, 0(rp)
- la rp, 24(rp)
- brctg %r0, L(top)
- j L(end)
-
-L(b0): lghi %r1, -1
- INITCY
- j L(top)
-
-L(b1): lg %r5, 0(up)
- la up, 8(up)
- lg %r9, 0(vp)
- la vp, 8(vp)
-
- algr %r9, %r9
- slbgr %r1, %r1
- ADSBR %r5, %r9
- slbgr %r13, %r13
-
- stg %r5, 0(rp)
- la rp, 8(rp)
- brctg %r0, L(top)
- j L(end)
-
-L(b2): lmg %r5, %r6, 0(up)
- la up, 16(up)
- lmg %r9, %r10, 0(vp)
- la vp, 16(vp)
-
- algr %r9, %r9
- alcgr %r10, %r10
- slbgr %r1, %r1
-
- ADSBR %r5, %r9
- ADSBCR %r6, %r10
- slbgr %r13, %r13
-
- stmg %r5, %r6, 0(rp)
- la rp, 16(rp)
- brctg %r0, L(top)
- j L(end)
-
-L(top): lmg %r9, %r12, 0(vp)
- la vp, 32(vp)
-
- aghi %r1, 1 C restore carry
-
- alcgr %r9, %r9
- alcgr %r10, %r10
- alcgr %r11, %r11
- alcgr %r12, %r12
-
- slbgr %r1, %r1 C save carry
-
- lmg %r5, %r8, 0(up)
- la up, 32(up)
-
- aghi %r13, 1 C restore carry
-
- ADSBCR %r5, %r9
- ADSBCR %r6, %r10
- ADSBCR %r7, %r11
- ADSBCR %r8, %r12
-
- slbgr %r13, %r13 C save carry
-
- stmg %r5, %r8, 0(rp)
- la rp, 32(rp)
- brctg %r0, L(top)
-
-L(end): RETVAL
- lmg %r6, %r13, 48(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/submul_1.asm b/gmp/mpn/s390_64/submul_1.asm
deleted file mode 100644
index 91c4b06631..0000000000
--- a/gmp/mpn/s390_64/submul_1.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-dnl S/390-64 mpn_submul_1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 35
-C z990 24
-C z9 ?
-C z10 28
-C z196 ?
-
-C INPUT PARAMETERS
-define(`rp', `%r2')
-define(`up', `%r3')
-define(`n', `%r4')
-define(`v0', `%r5')
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- stmg %r9, %r12, 72(%r15)
- lghi %r12, 0 C zero index register
- slgr %r11, %r11 C zero the carry limb, the subtraction leaves no borrow pending
-
-L(top): lg %r1, 0(%r12, up)
- lg %r10, 0(%r12, rp)
- mlgr %r0, v0 C %r0:%r1 = %r1 * v0 (high:low)
- slbgr %r10, %r1 C subtract the low part together with the pending borrow
- slbgr %r9, %r9 C %r9 = 0 without a borrow, -1 with one
- slgr %r0, %r9 C conditional incr
- slgr %r10, %r11 C subtract the carry limb of the previous iteration
- lgr %r11, %r0
- stg %r10, 0(%r12, rp)
- la %r12, 8(%r12) C la leaves the condition code alone, so the borrow survives
- brctg %r4, L(top)
-
- lgr %r2, %r11
- slbgr %r9, %r9 C fold the final borrow into the result in %r2
- slgr %r2, %r9
-
- lmg %r9, %r12, 72(%r15)
- br %r14
-EPILOGUE()
diff --git a/gmp/mpn/s390_64/z10/gmp-mparam.h b/gmp/mpn/s390_64/z10/gmp-mparam.h
deleted file mode 100644
index c034f9b3b6..0000000000
--- a/gmp/mpn/s390_64/z10/gmp-mparam.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/* S/390-64 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 4400 MHz IBM z10 */
-/* FFT tuning limit = 15000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.7 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 4
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 23
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 29
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 2
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 56
-
-#define MUL_TOOM22_THRESHOLD 8
-#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 88
-#define MUL_TOOM6H_THRESHOLD 125
-#define MUL_TOOM8H_THRESHOLD 163
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 58
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 61
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 62
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 12
-#define SQR_TOOM3_THRESHOLD 89
-#define SQR_TOOM4_THRESHOLD 130
-#define SQR_TOOM6_THRESHOLD 189
-#define SQR_TOOM8_THRESHOLD 260
-
-#define MULMID_TOOM42_THRESHOLD 24
-
-#define MULMOD_BNM1_THRESHOLD 9
-#define SQRMOD_BNM1_THRESHOLD 9
-
-#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 220, 5}, { 7, 4}, { 15, 5}, { 9, 6}, \
- { 5, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 11, 7}, { 6, 6}, { 13, 7}, { 7, 6}, \
- { 15, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \
- { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \
- { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \
- { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
- { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
- { 27,10}, { 15, 9}, { 39,10}, { 23,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 83,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255, 7}, { 511,10}, { 71, 9}, \
- { 143, 8}, { 287, 7}, { 575,10}, { 79,11}, \
- { 47,12}, { 31,11}, { 63,10}, { 127, 9}, \
- { 255, 8}, { 511,10}, { 143, 9}, { 287, 8}, \
- { 575,11}, { 79,10}, { 159, 9}, { 319, 8}, \
- { 639,10}, { 175, 9}, { 351, 8}, { 703, 7}, \
- { 1407, 6}, { 2815,10}, { 191, 9}, { 383, 8}, \
- { 767, 9}, { 415,11}, { 111,10}, { 223, 9}, \
- { 447, 8}, { 895,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
- { 575, 8}, { 1151,10}, { 319, 9}, { 639,11}, \
- { 175, 9}, { 703, 8}, { 1407, 7}, { 2815,11}, \
- { 191,10}, { 415, 9}, { 831,11}, { 223,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 287,10}, { 575, 9}, { 1151,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \
- { 1407, 8}, { 2815,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,10}, { 831,12}, { 223,10}, \
- { 895, 9}, { 1791,11}, { 479,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \
- { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \
- { 351,11}, { 703,10}, { 1407, 9}, { 2815,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,10}, { 1663,12}, { 447,11}, { 895,10}, \
- { 1791, 9}, { 3583,12}, { 479,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 575,11}, \
- { 1151,13}, { 319,12}, { 703,11}, { 1407,10}, \
- { 2815,13}, { 383,12}, { 767,11}, { 1535,12}, \
- { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \
- { 1791,10}, { 3583,14}, { 255,13}, { 511,12}, \
- { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \
- { 1279,13}, { 703,12}, { 1407,11}, { 2815,14}, \
- { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
- { 1663,13}, { 895,12}, { 1791,11}, { 3583,15}, \
- { 255,14}, { 511,13}, { 1151,14}, { 639,13}, \
- { 1279,12}, { 2559,13}, { 1407,12}, { 2815,13}, \
- { 1471,14}, { 767,13}, { 1663,14}, { 895,13}, \
- { 1791,12}, { 3583,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 205
-#define MUL_FFT_THRESHOLD 1728
-
-#define SQR_FFT_MODF_THRESHOLD 212 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 212, 5}, { 7, 4}, { 15, 5}, { 11, 6}, \
- { 6, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
- { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \
- { 7, 7}, { 16, 8}, { 9, 7}, { 19, 8}, \
- { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
- { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
- { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \
- { 23,10}, { 15, 9}, { 39,10}, { 23,11}, \
- { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \
- { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \
- { 287, 7}, { 575,10}, { 79,11}, { 47,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 143, 9}, { 287, 8}, { 575,11}, \
- { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \
- { 175, 9}, { 351, 8}, { 703,10}, { 191, 9}, \
- { 383, 8}, { 767,10}, { 207,11}, { 111,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
- { 143,10}, { 287, 9}, { 575, 8}, { 1151,11}, \
- { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
- { 351, 9}, { 703,12}, { 95,11}, { 191,10}, \
- { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 287,10}, { 575, 9}, { 1151,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \
- { 1407,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,12}, { 223,11}, { 447,10}, { 895, 9}, \
- { 1791,13}, { 127,12}, { 255,11}, { 511,12}, \
- { 287,11}, { 575,10}, { 1151,12}, { 319,11}, \
- { 639,12}, { 351,11}, { 703,10}, { 1407,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,10}, { 1663,12}, { 447,11}, { 895,10}, \
- { 1791, 9}, { 3583,12}, { 479,11}, { 959,10}, \
- { 1919,14}, { 127,13}, { 255,12}, { 511,11}, \
- { 1023,12}, { 575,11}, { 1151,13}, { 319,12}, \
- { 639,11}, { 1279,12}, { 703,11}, { 1407,10}, \
- { 2815,13}, { 383,12}, { 767,11}, { 1535,12}, \
- { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \
- { 1791,12}, { 959,11}, { 1919,14}, { 255,13}, \
- { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \
- { 639,12}, { 1279,13}, { 703,12}, { 1407,11}, \
- { 2815,14}, { 383,13}, { 767,12}, { 1535,13}, \
- { 831,12}, { 1663,13}, { 895,12}, { 1791,11}, \
- { 3583,13}, { 959,12}, { 1919,15}, { 255,14}, \
- { 511,13}, { 1023,12}, { 2047,13}, { 1151,14}, \
- { 639,13}, { 1279,12}, { 2559,13}, { 1407,12}, \
- { 2815,14}, { 767,13}, { 1663,14}, { 895,13}, \
- { 1791,12}, { 3583,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 201
-#define SQR_FFT_THRESHOLD 1728
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 34
-#define MULLO_MUL_N_THRESHOLD 3176
-
-#define DC_DIV_QR_THRESHOLD 39
-#define DC_DIVAPPR_Q_THRESHOLD 151
-#define DC_BDIV_QR_THRESHOLD 44
-#define DC_BDIV_Q_THRESHOLD 107
-
-#define INV_MULMOD_BNM1_THRESHOLD 14
-#define INV_NEWTON_THRESHOLD 163
-#define INV_APPR_THRESHOLD 154
-
-#define BINV_NEWTON_THRESHOLD 171
-#define REDC_1_TO_REDC_N_THRESHOLD 46
-
-#define MU_DIV_QR_THRESHOLD 792
-#define MU_DIVAPPR_Q_THRESHOLD 807
-#define MUPI_DIV_QR_THRESHOLD 81
-#define MU_BDIV_QR_THRESHOLD 654
-#define MU_BDIV_Q_THRESHOLD 792
-
-#define POWM_SEC_TABLE 3,19,194,946,2424
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 117
-#define HGCD_APPR_THRESHOLD 145
-#define HGCD_REDUCE_THRESHOLD 1329
-#define GCD_DC_THRESHOLD 318
-#define GCDEXT_DC_THRESHOLD 265
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 17
-#define GET_STR_PRECOMPUTE_THRESHOLD 35
-#define SET_STR_DC_THRESHOLD 1015
-#define SET_STR_PRECOMPUTE_THRESHOLD 2047
-
-#define FAC_DSC_THRESHOLD 330
-#define FAC_ODD_THRESHOLD 23
diff --git a/gmp/mpn/sh/add_n.asm b/gmp/mpn/sh/add_n.asm
deleted file mode 100644
index 79d17d0129..0000000000
--- a/gmp/mpn/sh/add_n.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-dnl SH mpn_add_n -- Add two limb vectors of the same length > 0 and store sum
-dnl in a third limb vector.
-
-dnl Copyright 1995, 1997, 2000, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-C rp r4
-C up r5
-C vp r6
-C n r7
-
-changecom(blah) C disable # to make all C comments below work
-
-ASM_START()
-PROLOGUE(mpn_add_n)
- mov #0,r3 C clear cy save reg
-
-L(top): mov.l @r5+,r1
- mov.l @r6+,r2
- shlr r3 C restore cy
- addc r2,r1
- movt r3 C save cy
- mov.l r1,@r4
- dt r7
- bf.s L(top)
- add #4,r4
-
- rts
- mov r3,r0 C return carry-out from most significant limb
-EPILOGUE()
diff --git a/gmp/mpn/sh/add_n.s b/gmp/mpn/sh/add_n.s
new file mode 100644
index 0000000000..914fb4fc70
--- /dev/null
+++ b/gmp/mpn/sh/add_n.s
@@ -0,0 +1,45 @@
+! SH __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
+ .text
+ .align 2
+ .global ___gmpn_add_n
+___gmpn_add_n:
+ mov #0,r3 ! clear cy save reg (r3 carries the cy between limbs)

+Loop: mov.l @r5+,r1 ! r1 = next s1 limb, s1_ptr += 4
+ mov.l @r6+,r2 ! r2 = next s2 limb, s2_ptr += 4
+ shlr r3 ! restore cy: low bit of r3 -> T
+ addc r2,r1 ! r1 = s1 limb + s2 limb + T, carry-out -> T
+ movt r3 ! save cy: r3 = T, because dt below overwrites T
+ mov.l r1,@r4 ! store sum limb at res_ptr
+ dt r7 ! size -= 1, T = 1 once size reaches 0
+ bf.s Loop ! loop while T = 0 (delayed branch)
+ add #4,r4 ! delay slot: res_ptr += 4, executed on every pass

+ rts
+ mov r3,r0 ! delay slot: return carry-out from most significant limb
diff --git a/gmp/mpn/sh/sh2/addmul_1.asm b/gmp/mpn/sh/sh2/addmul_1.asm
deleted file mode 100644
index c914b29541..0000000000
--- a/gmp/mpn/sh/sh2/addmul_1.asm
+++ /dev/null
@@ -1,65 +0,0 @@
-dnl SH2 mpn_addmul_1 -- Multiply a limb vector with a limb and add the result
-dnl to a second limb vector.
-
-dnl Copyright 1995, 2000, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-C res_ptr r4
-C s1_ptr r5
-C size r6
-C s2_limb r7
-
-changecom(blah) C disable # to make all C comments below work
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- mov #0,r2 C cy_limb = 0
- mov #0,r0 C Keep r0 = 0 for entire loop
- clrt
-
-L(top): mov.l @r5+,r3
- dmulu.l r3,r7
- sts macl,r1
- addc r2,r1 C lo_prod += old cy_limb
- sts mach,r2 C new cy_limb = hi_prod
- mov.l @r4,r3
- addc r0,r2 C cy_limb += T, T = 0
- addc r3,r1
- addc r0,r2 C cy_limb += T, T = 0
- dt r6
- mov.l r1,@r4
- bf.s L(top)
- add #4,r4
-
- rts
- mov r2,r0
-EPILOGUE()
diff --git a/gmp/mpn/sh/sh2/addmul_1.s b/gmp/mpn/sh/sh2/addmul_1.s
new file mode 100644
index 0000000000..df22deaf5c
--- /dev/null
+++ b/gmp/mpn/sh/sh2/addmul_1.s
@@ -0,0 +1,51 @@
+! SH2 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+! the result to a second limb vector.
+
+! Copyright 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt ! T = 0 so the first addc adds no stray carry

+Loop: mov.l @r5+,r3 ! r3 = next s1 limb, s1_ptr += 4
+ dmulu.l r3,r7 ! mach:macl = s1 limb * s2_limb (unsigned 64-bit product)
+ sts macl,r1 ! r1 = lo_prod
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3 ! r3 = current limb at res_ptr
+ addc r0,r2 ! cy_limb += T, T = 0
+ addc r3,r1 ! lo_prod += limb at res_ptr, carry-out -> T
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6 ! size -= 1, T = 1 once size reaches 0
+ mov.l r1,@r4 ! store updated limb at res_ptr
+ bf.s Loop ! loop while T = 0 (delayed branch)
+ add #4,r4 ! delay slot: res_ptr += 4, executed on every pass

+ rts
+ mov r2,r0 ! delay slot: return final cy_limb
diff --git a/gmp/mpn/sh/sh2/mul_1.asm b/gmp/mpn/sh/sh2/mul_1.asm
deleted file mode 100644
index 83548a6953..0000000000
--- a/gmp/mpn/sh/sh2/mul_1.asm
+++ /dev/null
@@ -1,62 +0,0 @@
-dnl SH2 mpn_mul_1 -- Multiply a limb vector with a limb and store the result
-dnl in a second limb vector.
-
-dnl Copyright 1995, 2000, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-C res_ptr r4
-C s1_ptr r5
-C size r6
-C s2_limb r7
-
-changecom(blah) C disable # to make all C comments below work
-
-ASM_START()
-PROLOGUE(mpn_mul_1)
- mov #0,r2 C cy_limb = 0
- mov #0,r0 C Keep r0 = 0 for entire loop
- clrt
-
-L(top): mov.l @r5+,r3
- dmulu.l r3,r7
- sts macl,r1
- addc r2,r1
- sts mach,r2
- addc r0,r2 C propagate carry to cy_limb (dt clobbers T)
- dt r6
- mov.l r1,@r4
- bf.s L(top)
- add #4,r4
-
- rts
- mov r2,r0
-EPILOGUE()
diff --git a/gmp/mpn/sh/sh2/mul_1.s b/gmp/mpn/sh/sh2/mul_1.s
new file mode 100644
index 0000000000..aa41bf2421
--- /dev/null
+++ b/gmp/mpn/sh/sh2/mul_1.s
@@ -0,0 +1,48 @@
+! SH2 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_mul_1
+___gmpn_mul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt ! T = 0 so the first addc adds no stray carry

+Loop: mov.l @r5+,r3 ! r3 = next s1 limb, s1_ptr += 4
+ dmulu.l r3,r7 ! mach:macl = s1 limb * s2_limb (unsigned 64-bit product)
+ sts macl,r1 ! r1 = lo_prod
+ addc r2,r1 ! lo_prod += old cy_limb, carry-out -> T
+ sts mach,r2 ! new cy_limb = hi_prod
+ addc r0,r2 ! propagate carry to cy_limb (dt clobbers T)
+ dt r6 ! size -= 1, T = 1 once size reaches 0
+ mov.l r1,@r4 ! store product limb at res_ptr
+ bf.s Loop ! loop while T = 0 (delayed branch)
+ add #4,r4 ! delay slot: res_ptr += 4, executed on every pass

+ rts
+ mov r2,r0 ! delay slot: return final cy_limb
diff --git a/gmp/mpn/sh/sh2/submul_1.asm b/gmp/mpn/sh/sh2/submul_1.asm
deleted file mode 100644
index bef2abd9b2..0000000000
--- a/gmp/mpn/sh/sh2/submul_1.asm
+++ /dev/null
@@ -1,65 +0,0 @@
-dnl SH2 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
-dnl result from a second limb vector.
-
-dnl Copyright 1995, 2000, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-C res_ptr r4
-C s1_ptr r5
-C size r6
-C s2_limb r7
-
-changecom(blah) C disable # to make all C comments below work
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- mov #0,r2 C cy_limb = 0
- mov #0,r0 C Keep r0 = 0 for entire loop
- clrt
-
-L(top): mov.l @r5+,r3
- dmulu.l r3,r7
- sts macl,r1
- addc r2,r1 C lo_prod += old cy_limb
- sts mach,r2 C new cy_limb = hi_prod
- mov.l @r4,r3
- addc r0,r2 C cy_limb += T, T = 0
- subc r1,r3
- addc r0,r2 C cy_limb += T, T = 0
- dt r6
- mov.l r3,@r4
- bf.s L(top)
- add #4,r4
-
- rts
- mov r2,r0
-EPILOGUE()
diff --git a/gmp/mpn/sh/sh2/submul_1.s b/gmp/mpn/sh/sh2/submul_1.s
new file mode 100644
index 0000000000..a1149c54fc
--- /dev/null
+++ b/gmp/mpn/sh/sh2/submul_1.s
@@ -0,0 +1,51 @@
+! SH2 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+! the result from a second limb vector.
+
+! Copyright 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_submul_1
+___gmpn_submul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt ! T = 0 so the first addc adds no stray carry

+Loop: mov.l @r5+,r3 ! r3 = next s1 limb, s1_ptr += 4
+ dmulu.l r3,r7 ! mach:macl = s1 limb * s2_limb (unsigned 64-bit product)
+ sts macl,r1 ! r1 = lo_prod
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3 ! r3 = current limb at res_ptr
+ addc r0,r2 ! cy_limb += T, T = 0
+ subc r1,r3 ! r3 = limb at res_ptr - lo_prod, borrow -> T (subc Rm,Rn: Rn -= Rm)
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6 ! size -= 1, T = 1 once size reaches 0
+ mov.l r3,@r4 ! store the difference (r3), not the product (r1)
+ bf.s Loop ! loop while T = 0 (delayed branch)
+ add #4,r4 ! delay slot: res_ptr += 4, executed on every pass

+ rts
+ mov r2,r0 ! delay slot: return final cy_limb (borrow-out)
diff --git a/gmp/mpn/sh/sub_n.asm b/gmp/mpn/sh/sub_n.asm
deleted file mode 100644
index 465bc806fa..0000000000
--- a/gmp/mpn/sh/sub_n.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-dnl SH mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
-dnl difference in a third limb vector.
-
-dnl Copyright 1995, 1997, 2000, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-C rp r4
-C up r5
-C vp r6
-C n r7
-
-changecom(blah) C disable # to make all C comments below work
-
-ASM_START()
-PROLOGUE(mpn_sub_n)
- mov #0,r3 C clear cy save reg
-
-L(top): mov.l @r5+,r1
- mov.l @r6+,r2
- shlr r3 C restore cy
- subc r2,r1
- movt r3 C save cy
- mov.l r1,@r4
- dt r7
- bf.s L(top)
- add #4,r4
-
- rts
- mov r3,r0 C return carry-out from most significant limb
-EPILOGUE()
diff --git a/gmp/mpn/sh/sub_n.s b/gmp/mpn/sh/sub_n.s
new file mode 100644
index 0000000000..b06e09a727
--- /dev/null
+++ b/gmp/mpn/sh/sub_n.s
@@ -0,0 +1,45 @@
+! SH __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+! difference in a third limb vector.
+
+! Copyright 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
+ .text
+ .align 2
+ .global ___gmpn_sub_n
+___gmpn_sub_n:
+ mov #0,r3 ! clear cy save reg (r3 carries the borrow between limbs)

+Loop: mov.l @r5+,r1 ! r1 = next s1 limb, s1_ptr += 4
+ mov.l @r6+,r2 ! r2 = next s2 limb, s2_ptr += 4
+ shlr r3 ! restore cy: low bit of r3 -> T
+ subc r2,r1 ! r1 = s1 limb - s2 limb - T, borrow-out -> T
+ movt r3 ! save cy: r3 = T, because dt below overwrites T
+ mov.l r1,@r4 ! store difference limb at res_ptr
+ dt r7 ! size -= 1, T = 1 once size reaches 0
+ bf.s Loop ! loop while T = 0 (delayed branch)
+ add #4,r4 ! delay slot: res_ptr += 4, executed on every pass

+ rts
+ mov r3,r0 ! delay slot: return borrow-out from most significant limb
diff --git a/gmp/mpn/sparc32/README b/gmp/mpn/sparc32/README
index f2dd1160f7..825a1ace8e 100644
--- a/gmp/mpn/sparc32/README
+++ b/gmp/mpn/sparc32/README
@@ -3,28 +3,17 @@ Copyright 1996, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/sparc32/add_n.asm b/gmp/mpn/sparc32/add_n.asm
index 8549195d92..7c8a9c41e8 100644
--- a/gmp/mpn/sparc32/add_n.asm
+++ b/gmp/mpn/sparc32/add_n.asm
@@ -4,30 +4,19 @@ dnl sum in a third limb vector.
dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/addmul_1.asm b/gmp/mpn/sparc32/addmul_1.asm
index 92d5d78d51..d73529e304 100644
--- a/gmp/mpn/sparc32/addmul_1.asm
+++ b/gmp/mpn/sparc32/addmul_1.asm
@@ -1,33 +1,22 @@
dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl result to a second limb vector.
-dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/gmp-mparam.h b/gmp/mpn/sparc32/gmp-mparam.h
index a3bc612543..3bc6cd6db4 100644
--- a/gmp/mpn/sparc32/gmp-mparam.h
+++ b/gmp/mpn/sparc32/gmp-mparam.h
@@ -1,40 +1,30 @@
/* SPARC v7 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* Generated by tuneup.c, 2002-03-13, gcc 2.95, Weitek 8701 */
-#define MUL_TOOM22_THRESHOLD 8
-#define MUL_TOOM33_THRESHOLD 466
+#define MUL_KARATSUBA_THRESHOLD 8
+#define MUL_TOOM3_THRESHOLD 466
#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 16
+#define SQR_KARATSUBA_THRESHOLD 16
#define SQR_TOOM3_THRESHOLD 258
#define DIV_SB_PREINV_THRESHOLD 4
diff --git a/gmp/mpn/sparc32/lshift.asm b/gmp/mpn/sparc32/lshift.asm
index 8321343d6b..00004f87a7 100644
--- a/gmp/mpn/sparc32/lshift.asm
+++ b/gmp/mpn/sparc32/lshift.asm
@@ -3,30 +3,19 @@ dnl SPARC mpn_lshift -- Shift a number left.
dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/mul_1.asm b/gmp/mpn/sparc32/mul_1.asm
index 42b4168be1..147db11157 100644
--- a/gmp/mpn/sparc32/mul_1.asm
+++ b/gmp/mpn/sparc32/mul_1.asm
@@ -1,33 +1,22 @@
dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl the result in a second limb vector.
-dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/rshift.asm b/gmp/mpn/sparc32/rshift.asm
index e1554766fe..26db4419d8 100644
--- a/gmp/mpn/sparc32/rshift.asm
+++ b/gmp/mpn/sparc32/rshift.asm
@@ -3,30 +3,19 @@ dnl SPARC mpn_rshift -- Shift a number right.
dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/sparc-defs.m4 b/gmp/mpn/sparc32/sparc-defs.m4
index 5a0d425204..10a4a86a63 100644
--- a/gmp/mpn/sparc32/sparc-defs.m4
+++ b/gmp/mpn/sparc32/sparc-defs.m4
@@ -3,36 +3,25 @@ divert(-1)
dnl m4 macros for SPARC assembler (32 and 64 bit).
-dnl Copyright 2002, 2011, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-changecom(;) dnl cannot use default # since that's used in REGISTER decls
+changecom(!)
dnl Usage: REGISTER(reg,attr)
@@ -47,33 +36,4 @@ m4_assert_defined(`HAVE_REGISTER')
`.register `$1',`$2'')')
-C Testing mechanism for running newer code on older processors
-ifdef(`FAKE_T3',`
- include_mpn(`sparc64/ultrasparct3/missing.m4')
-',`
- define(`addxccc', ``addxccc' $1, $2, $3')
- define(`addxc', ``addxc' $1, $2, $3')
- define(`umulxhi', ``umulxhi' $1, $2, $3')
- define(`lzcnt', ``lzd' $1, $2')
-')
-
-dnl Usage: LEA64(symbol,reg,pic_reg)
-dnl
-dnl Use whatever 64-bit code sequence is appropriate to load "symbol" into
-dnl register "reg", potentially using register "pic_reg" to perform the
-dnl calculations.
-
-define(LEA64,
-m4_assert_numargs(3)
-m4_assert_defined(`HAVE_GOTDATA')
-`ifdef(`PIC',`
- rd %pc, %`$2'
- sethi %hi(_GLOBAL_OFFSET_TABLE_+4), %`$3'
- add %`$3', %lo(_GLOBAL_OFFSET_TABLE_+8), %`$3'
- add %`$2', %`$3', %`$3'
- sethi %hi(`$1'), %`$2'
- or %`$2', %lo(`$1'), %`$2'
- ldx [%`$3' + %`$2'], %`$2'',`
- setx `$1', %`$3', %`$2'')')
-
divert
diff --git a/gmp/mpn/sparc32/sub_n.asm b/gmp/mpn/sparc32/sub_n.asm
index 24a576d82b..4fc759dcbf 100644
--- a/gmp/mpn/sparc32/sub_n.asm
+++ b/gmp/mpn/sparc32/sub_n.asm
@@ -4,30 +4,19 @@ dnl store difference in a third limb vector.
dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/submul_1.asm b/gmp/mpn/sparc32/submul_1.asm
index 73f9377006..9cde45fc3b 100644
--- a/gmp/mpn/sparc32/submul_1.asm
+++ b/gmp/mpn/sparc32/submul_1.asm
@@ -1,33 +1,22 @@
dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract
dnl the result from a second limb vector.
-dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/udiv.asm b/gmp/mpn/sparc32/udiv.asm
index 23ab3de1db..fc520f67e2 100644
--- a/gmp/mpn/sparc32/udiv.asm
+++ b/gmp/mpn/sparc32/udiv.asm
@@ -4,30 +4,19 @@ dnl This is for v7 CPUs with a floating-point unit.
dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/udiv_nfp.asm b/gmp/mpn/sparc32/udiv_nfp.asm
index ebbb820639..a7513ead6c 100644
--- a/gmp/mpn/sparc32/udiv_nfp.asm
+++ b/gmp/mpn/sparc32/udiv_nfp.asm
@@ -4,30 +4,19 @@ dnl This is for v7 CPUs without a floating-point unit.
dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/ultrasparct1/add_n.asm b/gmp/mpn/sparc32/ultrasparct1/add_n.asm
deleted file mode 100644
index c781596dad..0000000000
--- a/gmp/mpn/sparc32/ultrasparct1/add_n.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-dnl SPARC T1 32-bit mpn_add_n.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-define(`rp', %o0)
-define(`ap', %o1)
-define(`bp', %o2)
-define(`n', %o3)
-define(`cy', %o4)
-
-define(`i', %o3)
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
-
-ASM_START()
-PROLOGUE(mpn_add_nc)
- b L(ent)
- srl cy, 0, cy C strip any bogus high bits
-EPILOGUE()
-
-PROLOGUE(mpn_add_n)
- mov 0, cy
-L(ent): srl n, 0, n C strip any bogus high bits
- sll n, 2, n
- add ap, n, ap
- add bp, n, bp
- add rp, n, rp
- neg n, i
-
-L(top): lduw [ap+i], %g1
- lduw [bp+i], %g2
- add %g1, %g2, %g3
- add %g3, cy, %g3
- stw %g3, [rp+i]
- add i, 4, i
- brnz i, L(top)
- srlx %g3, 32, cy
-
- retl
- mov cy, %o0 C return value
-EPILOGUE()
diff --git a/gmp/mpn/sparc32/ultrasparct1/addmul_1.asm b/gmp/mpn/sparc32/ultrasparct1/addmul_1.asm
deleted file mode 100644
index 89da186457..0000000000
--- a/gmp/mpn/sparc32/ultrasparct1/addmul_1.asm
+++ /dev/null
@@ -1,90 +0,0 @@
-dnl SPARC T1 32-bit mpn_addmul_1.
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: 24
-C UltraSPARC T2: 19
-C UltraSPARC T3: 19
-C UltraSPARC T4: 5
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- save %sp, -96, %sp
- srl n, 0, %o4
- srl v0, 0, %g1
- subcc %o4, 1, %o4
- be L(final_one)
- clr %o5
-
-L(top): lduw [up+0], %l0
- lduw [rp+0], %l2
- lduw [up+4], %l1
- lduw [rp+4], %l3
- mulx %l0, %g1, %g3
- add up, 8, up
- mulx %l1, %g1, %o3
- sub %o4, 2, %o4
- add rp, 8, rp
- add %l2, %g3, %g3
- add %o5, %g3, %g3
- stw %g3, [rp-8]
- srlx %g3, 32, %o5
- add %l3, %o3, %o3
- add %o5, %o3, %o3
- stw %o3, [rp-4]
- brgz %o4, L(top)
- srlx %o3, 32, %o5
-
- brlz,pt %o4, L(done)
- nop
-
-L(final_one):
- lduw [up+0], %l0
- lduw [rp+0], %l2
- mulx %l0, %g1, %g3
- add %l2, %g3, %g3
- add %o5, %g3, %g3
- stw %g3, [rp+0]
- srlx %g3, 32, %o5
-
-L(done):
- ret
- restore %o5, 0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc32/ultrasparct1/gmp-mparam.h b/gmp/mpn/sparc32/ultrasparct1/gmp-mparam.h
deleted file mode 100644
index 6f9d5a44ca..0000000000
--- a/gmp/mpn/sparc32/ultrasparct1/gmp-mparam.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* UltraSPARC T 32-bit gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 3
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 10
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 35
-
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 98
-#define MUL_TOOM44_THRESHOLD 166
-#define MUL_TOOM6H_THRESHOLD 226
-#define MUL_TOOM8H_THRESHOLD 333
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 139
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 120
-
-#define SQR_BASECASE_THRESHOLD 6
-#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 110
-#define SQR_TOOM4_THRESHOLD 178
-#define SQR_TOOM6_THRESHOLD 240
-#define SQR_TOOM8_THRESHOLD 333
-
-#define MULMID_TOOM42_THRESHOLD 22
-
-#define MULMOD_BNM1_THRESHOLD 9
-#define SQRMOD_BNM1_THRESHOLD 13
-
-#define MUL_FFT_MODF_THRESHOLD 280 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 280, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 9, 5}, { 19, 6}, { 13, 7}, { 7, 6}, \
- { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 21, 8}, \
- { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
- { 33, 8}, { 19, 7}, { 41, 8}, { 23, 7}, \
- { 49, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \
- { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47,10}, { 31, 9}, { 79,10}, \
- { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255,10}, { 79, 9}, { 159, 8}, { 319,10}, \
- { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
- { 159, 9}, { 319,10}, { 175,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 66
-#define MUL_FFT_THRESHOLD 3712
-
-#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 240, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
- { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \
- { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \
- { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \
- { 39, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \
- { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \
- { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \
- { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
- { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 143,10}, { 79, 9}, { 159, 8}, \
- { 319, 9}, { 175,10}, { 95, 9}, { 191, 8}, \
- { 383, 9}, { 207,11}, { 63,10}, { 127, 9}, \
- { 255,10}, { 143, 9}, { 287,10}, { 159, 9}, \
- { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 70
-#define SQR_FFT_THRESHOLD 2624
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 51
-#define MULLO_MUL_N_THRESHOLD 6633
-
-#define DC_DIV_QR_THRESHOLD 51
-#define DC_DIVAPPR_Q_THRESHOLD 202
-#define DC_BDIV_QR_THRESHOLD 47
-#define DC_BDIV_Q_THRESHOLD 124
-
-#define INV_MULMOD_BNM1_THRESHOLD 26
-#define INV_NEWTON_THRESHOLD 266
-#define INV_APPR_THRESHOLD 222
-
-#define BINV_NEWTON_THRESHOLD 296
-#define REDC_1_TO_REDC_N_THRESHOLD 59
-
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1499
-#define MUPI_DIV_QR_THRESHOLD 116
-#define MU_BDIV_QR_THRESHOLD 1057
-#define MU_BDIV_Q_THRESHOLD 1334
-
-#define POWM_SEC_TABLE 6,35,213,724,2618
-
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 84
-#define HGCD_APPR_THRESHOLD 101
-#define HGCD_REDUCE_THRESHOLD 1437
-#define GCD_DC_THRESHOLD 372
-#define GCDEXT_DC_THRESHOLD 253
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 399
-#define SET_STR_PRECOMPUTE_THRESHOLD 885
-
-#define FAC_DSC_THRESHOLD 179
-#define FAC_ODD_THRESHOLD 29
diff --git a/gmp/mpn/sparc32/ultrasparct1/mul_1.asm b/gmp/mpn/sparc32/ultrasparct1/mul_1.asm
deleted file mode 100644
index 0239cd28cd..0000000000
--- a/gmp/mpn/sparc32/ultrasparct1/mul_1.asm
+++ /dev/null
@@ -1,83 +0,0 @@
-dnl SPARC T1 32-bit mpn_mul_1.
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: 20
-C UltraSPARC T2: 18
-C UltraSPARC T3: 18
-C UltraSPARC T4: 4
-
-C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`n', `%o2')
-define(`v0', `%o3')
-
-ASM_START()
-PROLOGUE(mpn_mul_1)
- srl n, 0, n
- srl v0, 0, v0
- subcc n, 1, n
- be L(final_one)
- clr %o5
-
-L(top): lduw [up+0], %g1
- lduw [up+4], %g2
- mulx %g1, v0, %g3
- add up, 8, up
- mulx %g2, v0, %o4
- sub n, 2, n
- add rp, 8, rp
- add %o5, %g3, %g3
- stw %g3, [rp-8]
- srlx %g3, 32, %o5
- add %o5, %o4, %o4
- stw %o4, [rp-4]
- brgz n, L(top)
- srlx %o4, 32, %o5
-
- brlz,pt n, L(done)
- nop
-
-L(final_one):
- lduw [up+0], %g1
- mulx %g1, v0, %g3
- add %o5, %g3, %g3
- stw %g3, [rp+0]
- srlx %g3, 32, %o5
-
-L(done):
- retl
- mov %o5, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc32/ultrasparct1/sqr_diagonal.asm b/gmp/mpn/sparc32/ultrasparct1/sqr_diagonal.asm
deleted file mode 100644
index 3b906ef202..0000000000
--- a/gmp/mpn/sparc32/ultrasparct1/sqr_diagonal.asm
+++ /dev/null
@@ -1,55 +0,0 @@
-dnl SPARC T1 32-bit mpn_sqr_diagonal.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`n', `%o2')
-
-ASM_START()
-PROLOGUE(mpn_sqr_diagonal)
- deccc n C n--
- nop
-
-L(top): lduw [up+0], %g1
- add up, 4, up C up++
- mulx %g1, %g1, %g3
- stw %g3, [rp+0]
- srlx %g3, 32, %g4
- stw %g4, [rp+4]
- add rp, 8, rp C rp += 2
- bnz %icc, L(top)
- deccc n C n--
-
- retl
- nop
-EPILOGUE()
diff --git a/gmp/mpn/sparc32/ultrasparct1/sub_n.asm b/gmp/mpn/sparc32/ultrasparct1/sub_n.asm
deleted file mode 100644
index 946bc3ff8e..0000000000
--- a/gmp/mpn/sparc32/ultrasparct1/sub_n.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-dnl SPARC T1 32-bit mpn_sub_n.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-define(`rp', %o0)
-define(`ap', %o1)
-define(`bp', %o2)
-define(`n', %o3)
-define(`cy', %o4)
-
-define(`i', %o3)
-
-MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
-
-ASM_START()
-PROLOGUE(mpn_sub_nc)
- b L(ent)
- srl cy, 0, cy C strip any bogus high bits
-EPILOGUE()
-
-PROLOGUE(mpn_sub_n)
- mov 0, cy
-L(ent): srl n, 0, n C strip any bogus high bits
- sll n, 2, n
- add ap, n, ap
- add bp, n, bp
- add rp, n, rp
- neg n, i
-
-L(top): lduw [ap+i], %g1
- lduw [bp+i], %g2
- sub %g1, %g2, %g3
- sub %g3, cy, %g3
- stw %g3, [rp+i]
- add i, 4, i
- brnz i, L(top)
- srlx %g3, 63, cy
-
- retl
- mov cy, %o0 C return value
-EPILOGUE()
diff --git a/gmp/mpn/sparc32/ultrasparct1/submul_1.asm b/gmp/mpn/sparc32/ultrasparct1/submul_1.asm
deleted file mode 100644
index 89200709c4..0000000000
--- a/gmp/mpn/sparc32/ultrasparct1/submul_1.asm
+++ /dev/null
@@ -1,91 +0,0 @@
-dnl SPARC T1 32-bit mpn_submul_1.
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2010, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: 24
-C UltraSPARC T2: 19
-C UltraSPARC T3: 19
-C UltraSPARC T4: 5
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- save %sp, -96, %sp
- srl n, 0, %o4
- srl v0, 0, %g1
- subcc %o4, 1, %o4
- be L(final_one)
- subcc %g0, 0, %o5
-
-L(top): lduw [up+0], %l0
- lduw [rp+0], %l2
- lduw [up+4], %l1
- lduw [rp+4], %l3
- mulx %l0, %g1, %g3
- add up, 8, up
- mulx %l1, %g1, %o3
- sub %o4, 2, %o4
- add rp, 8, rp
- addx %o5, %g3, %g3
- srlx %g3, 32, %o5
- subcc %l2, %g3, %g3
- stw %g3, [rp-8]
- addx %o5, %o3, %o3
- srlx %o3, 32, %o5
- subcc %l3, %o3, %o3
- brgz %o4, L(top)
- stw %o3, [rp-4]
-
- brlz,pt %o4, L(done)
- nop
-
-L(final_one):
- lduw [up+0], %l0
- lduw [rp+0], %l2
- mulx %l0, %g1, %g3
- addx %o5, %g3, %g3
- srlx %g3, 32, %o5
- subcc %l2, %g3, %g3
- stw %g3, [rp+0]
-
-L(done):
- addx %o5, 0, %o5
- ret
- restore %o5, 0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc32/umul.asm b/gmp/mpn/sparc32/umul.asm
index 3a20b95cb5..80c82122d9 100644
--- a/gmp/mpn/sparc32/umul.asm
+++ b/gmp/mpn/sparc32/umul.asm
@@ -3,30 +3,19 @@ dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc.
dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v8/addmul_1.asm b/gmp/mpn/sparc32/v8/addmul_1.asm
index 0052092784..6e5e78865b 100644
--- a/gmp/mpn/sparc32/v8/addmul_1.asm
+++ b/gmp/mpn/sparc32/v8/addmul_1.asm
@@ -1,33 +1,22 @@
dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and
dnl add the result to a second limb vector.
-dnl Copyright 1992-1995, 2000 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v8/gmp-mparam.h b/gmp/mpn/sparc32/v8/gmp-mparam.h
index e57897b439..f042c19e5e 100644
--- a/gmp/mpn/sparc32/v8/gmp-mparam.h
+++ b/gmp/mpn/sparc32/v8/gmp-mparam.h
@@ -1,44 +1,34 @@
/* SPARC v8 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* Generated by tuneup.c, 2004-02-07, gcc 2.95 */
-#define MUL_TOOM22_THRESHOLD 10
-#define MUL_TOOM33_THRESHOLD 65
+#define MUL_KARATSUBA_THRESHOLD 10
+#define MUL_TOOM3_THRESHOLD 65
#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 18
+#define SQR_KARATSUBA_THRESHOLD 18
#define SQR_TOOM3_THRESHOLD 65
#define DIV_SB_PREINV_THRESHOLD 5
diff --git a/gmp/mpn/sparc32/v8/mul_1.asm b/gmp/mpn/sparc32/v8/mul_1.asm
index e26c853aed..d428debf15 100644
--- a/gmp/mpn/sparc32/v8/mul_1.asm
+++ b/gmp/mpn/sparc32/v8/mul_1.asm
@@ -4,30 +4,19 @@ dnl store the product in a second limb vector.
dnl Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v8/submul_1.asm b/gmp/mpn/sparc32/v8/submul_1.asm
index 187314ecef..4dde012808 100644
--- a/gmp/mpn/sparc32/v8/submul_1.asm
+++ b/gmp/mpn/sparc32/v8/submul_1.asm
@@ -1,33 +1,22 @@
dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and
dnl subtract the result from a second limb vector.
-dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+dnl Copyright 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h b/gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h
index 1ac9239e3c..feb90ef408 100644
--- a/gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h
+++ b/gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h
@@ -1,44 +1,34 @@
/* SuperSPARC gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* Generated by tuneup.c, 2004-02-10, gcc 3.3 */
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 81
+#define MUL_KARATSUBA_THRESHOLD 14
+#define MUL_TOOM3_THRESHOLD 81
#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 28
+#define SQR_KARATSUBA_THRESHOLD 28
#define SQR_TOOM3_THRESHOLD 86
#define DIV_SB_PREINV_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/gmp/mpn/sparc32/v8/supersparc/udiv.asm
index 12f66ce6a2..2ce3b8f15c 100644
--- a/gmp/mpn/sparc32/v8/supersparc/udiv.asm
+++ b/gmp/mpn/sparc32/v8/supersparc/udiv.asm
@@ -5,30 +5,19 @@ dnl udiv instruction.
dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v8/udiv.asm b/gmp/mpn/sparc32/v8/udiv.asm
index 12f66ce6a2..2ce3b8f15c 100644
--- a/gmp/mpn/sparc32/v8/udiv.asm
+++ b/gmp/mpn/sparc32/v8/udiv.asm
@@ -5,30 +5,19 @@ dnl udiv instruction.
dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v8/umul.asm b/gmp/mpn/sparc32/v8/umul.asm
index 1a2e84b1f6..569a4e8dd3 100644
--- a/gmp/mpn/sparc32/v8/umul.asm
+++ b/gmp/mpn/sparc32/v8/umul.asm
@@ -3,30 +3,19 @@ dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc.
dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v9/add_n.asm b/gmp/mpn/sparc32/v9/add_n.asm
index 7bd5974fd3..a21cf10d55 100644
--- a/gmp/mpn/sparc32/v9/add_n.asm
+++ b/gmp/mpn/sparc32/v9/add_n.asm
@@ -4,30 +4,19 @@ dnl sum in a third limb vector.
dnl Copyright 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v9/addmul_1.asm b/gmp/mpn/sparc32/v9/addmul_1.asm
index 2adf7a8a2f..18b9a72d1a 100644
--- a/gmp/mpn/sparc32/v9/addmul_1.asm
+++ b/gmp/mpn/sparc32/v9/addmul_1.asm
@@ -4,30 +4,19 @@ dnl the result to a second limb vector.
dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v9/gmp-mparam.h b/gmp/mpn/sparc32/v9/gmp-mparam.h
index f909e2cf18..5d06398192 100644
--- a/gmp/mpn/sparc32/v9/gmp-mparam.h
+++ b/gmp/mpn/sparc32/v9/gmp-mparam.h
@@ -1,204 +1,73 @@
/* SPARC v9 32-bit gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009-2011, 2014 Free Software
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004, 2009 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-16, gcc 3.4 */
-
-#define DIVREM_1_NORM_THRESHOLD 3
-#define DIVREM_1_UNNORM_THRESHOLD 4
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 4
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 13
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 12
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 32
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 4
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 43
-#define MUL_TOOM44_THRESHOLD 126
-#define MUL_TOOM6H_THRESHOLD 161
-#define MUL_TOOM8H_THRESHOLD 208
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 80
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 55
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 72
-
-#define SQR_BASECASE_THRESHOLD 4
-#define SQR_TOOM2_THRESHOLD 64
-#define SQR_TOOM3_THRESHOLD 85
-#define SQR_TOOM4_THRESHOLD 152
-#define SQR_TOOM6_THRESHOLD 185
-#define SQR_TOOM8_THRESHOLD 324
-
-#define MULMID_TOOM42_THRESHOLD 64
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 16
-
-#define MUL_FFT_MODF_THRESHOLD 288 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 288, 5}, { 9, 4}, { 19, 5}, { 11, 6}, \
- { 6, 5}, { 14, 6}, { 8, 5}, { 17, 6}, \
- { 9, 5}, { 20, 6}, { 13, 7}, { 7, 6}, \
- { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
- { 23, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \
- { 31, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \
- { 7, 8}, { 15, 7}, { 31, 8}, { 19, 7}, \
- { 39, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \
- { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47,10}, { 31, 9}, { 71, 8}, \
- { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \
- { 143, 8}, { 287,10}, { 79, 9}, { 175,10}, \
- { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \
- { 63,10}, { 143, 9}, { 287, 8}, { 575,10}, \
- { 175,11}, { 95,10}, { 191, 9}, { 415, 8}, \
- { 831,12}, { 63,11}, { 127,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \
- { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 447, 9}, { 895, 8}, { 1791,12}, { 127,11}, \
- { 287,10}, { 607, 9}, { 1215, 8}, { 2431,11}, \
- { 319, 9}, { 1279,11}, { 351,12}, { 191,11}, \
- { 415,10}, { 831,11}, { 447,10}, { 895, 9}, \
- { 1791,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
- { 703,12}, { 383,11}, { 831,12}, { 447,11}, \
- { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \
- { 575,11}, { 1215,10}, { 2431,12}, { 703,13}, \
- { 383,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1087,11}, { 2175,12}, { 1215,11}, { 2431,13}, \
- { 639,12}, { 1407,11}, { 2943,13}, { 895,12}, \
- { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \
- { 1407,14}, { 767,13}, { 1791,15}, { 511,14}, \
- { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
- { 5887,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 143
-#define MUL_FFT_THRESHOLD 2240
-
-#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 244, 5}, { 8, 4}, { 17, 5}, { 17, 6}, \
- { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \
- { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \
- { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \
- { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \
- { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \
- { 23,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
- { 47,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 71, 8}, { 143, 7}, { 287, 9}, { 79,10}, \
- { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 143, 8}, { 287,10}, { 79, 9}, \
- { 159, 8}, { 319, 9}, { 175, 8}, { 351, 7}, \
- { 703,10}, { 95, 9}, { 191, 8}, { 383, 9}, \
- { 207, 8}, { 415, 9}, { 223,11}, { 63,10}, \
- { 127, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159, 9}, { 319,10}, { 175, 9}, \
- { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207, 9}, { 415, 8}, { 831,10}, \
- { 223,12}, { 63,11}, { 127,10}, { 271, 9}, \
- { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 319, 9}, { 639,10}, { 351, 9}, { 703, 8}, \
- { 1407,11}, { 191,10}, { 415, 9}, { 831,11}, \
- { 223,10}, { 447, 9}, { 895,10}, { 479,12}, \
- { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
- { 575,11}, { 319,10}, { 639,11}, { 351,10}, \
- { 703,12}, { 191,11}, { 415,10}, { 831,11}, \
- { 447,10}, { 895, 9}, { 1791,13}, { 127,12}, \
- { 255,11}, { 575,12}, { 319,11}, { 703,10}, \
- { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \
- { 959,10}, { 1919, 9}, { 3839,13}, { 255,12}, \
- { 575,11}, { 1151,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1215,11}, { 2431,13}, { 639,12}, { 1407,13}, \
- { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \
- { 511,13}, { 1151,12}, { 2431,13}, { 1407,12}, \
- { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \
- { 1919,15}, { 511,14}, { 1023,13}, { 2431,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 153
-#define SQR_FFT_THRESHOLD 2112
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 144
-#define MULLO_MUL_N_THRESHOLD 4292
-
-#define DC_DIV_QR_THRESHOLD 74
-#define DC_DIVAPPR_Q_THRESHOLD 406
-#define DC_BDIV_QR_THRESHOLD 63
-#define DC_BDIV_Q_THRESHOLD 363
-
-#define INV_MULMOD_BNM1_THRESHOLD 108
-#define INV_NEWTON_THRESHOLD 351
-#define INV_APPR_THRESHOLD 303
-
-#define BINV_NEWTON_THRESHOLD 354
-#define REDC_1_TO_REDC_N_THRESHOLD 61
-
-#define MU_DIV_QR_THRESHOLD 998
-#define MU_DIVAPPR_Q_THRESHOLD 1099
-#define MUPI_DIV_QR_THRESHOLD 118
-#define MU_BDIV_QR_THRESHOLD 807
-#define MU_BDIV_Q_THRESHOLD 979
-
-#define POWM_SEC_TABLE 3,22,127,624,779,2351
-
-#define MATRIX22_STRASSEN_THRESHOLD 7
-#define HGCD_THRESHOLD 90
-#define HGCD_APPR_THRESHOLD 123
-#define HGCD_REDUCE_THRESHOLD 1494
-#define GCD_DC_THRESHOLD 283
-#define GCDEXT_DC_THRESHOLD 192
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 290
-#define SET_STR_PRECOMPUTE_THRESHOLD 634
-
-#define FAC_DSC_THRESHOLD 156
-#define FAC_ODD_THRESHOLD 25
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2009-02-12, gcc 2.95 */
+
+#define MUL_KARATSUBA_THRESHOLD 28
+#define MUL_TOOM3_THRESHOLD 97
+#define MUL_TOOM44_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 8
+#define SQR_KARATSUBA_THRESHOLD 60
+#define SQR_TOOM3_THRESHOLD 138
+#define SQR_TOOM4_THRESHOLD 278
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 111
+#define MULLOW_MUL_N_THRESHOLD 434
+
+#define DIV_SB_PREINV_THRESHOLD 7
+#define DIV_DC_THRESHOLD 122
+#define POWM_THRESHOLD 154
+
+#define MATRIX22_STRASSEN_THRESHOLD 12
+#define HGCD_THRESHOLD 155
+#define GCD_DC_THRESHOLD 614
+#define GCDEXT_DC_THRESHOLD 438
+#define JACOBI_BASE_METHOD 2
+
+#define DIVREM_1_NORM_THRESHOLD 5
+#define DIVREM_1_UNNORM_THRESHOLD 14
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1_1_THRESHOLD 7
+#define MOD_1_2_THRESHOLD 8
+#define MOD_1_4_THRESHOLD 16
+#define USE_PREINV_DIVREM_1 1
+#define USE_PREINV_MOD_1 1
+#define DIVREM_2_THRESHOLD 0 /* always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 19
+#define SET_STR_DC_THRESHOLD 802
+#define SET_STR_PRECOMPUTE_THRESHOLD 1647
+
+#define MUL_FFT_TABLE { 304, 736, 1152, 3584, 10240, 24576, 98304, 393216, 0 }
+#define MUL_FFT_MODF_THRESHOLD 264
+#define MUL_FFT_THRESHOLD 2304
+
+#define SQR_FFT_TABLE { 336, 800, 1408, 3584, 10240, 24576, 98304, 393216, 0 }
+#define SQR_FFT_MODF_THRESHOLD 248
+#define SQR_FFT_THRESHOLD 2304
diff --git a/gmp/mpn/sparc32/v9/mul_1.asm b/gmp/mpn/sparc32/v9/mul_1.asm
index 40aeffad4f..881f46fb62 100644
--- a/gmp/mpn/sparc32/v9/mul_1.asm
+++ b/gmp/mpn/sparc32/v9/mul_1.asm
@@ -4,30 +4,19 @@ dnl the result in a second limb vector.
dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v9/sqr_diagonal.asm b/gmp/mpn/sparc32/v9/sqr_diagonal.asm
index e024279849..e4a78c5de7 100644
--- a/gmp/mpn/sparc32/v9/sqr_diagonal.asm
+++ b/gmp/mpn/sparc32/v9/sqr_diagonal.asm
@@ -3,30 +3,19 @@ dnl SPARC v9 32-bit mpn_sqr_diagonal.
dnl Copyright 2001, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v9/sub_n.asm b/gmp/mpn/sparc32/v9/sub_n.asm
index 636c73bf35..cea474326c 100644
--- a/gmp/mpn/sparc32/v9/sub_n.asm
+++ b/gmp/mpn/sparc32/v9/sub_n.asm
@@ -4,30 +4,19 @@ dnl store difference in a third limb vector.
dnl Copyright 2001 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v9/submul_1.asm b/gmp/mpn/sparc32/v9/submul_1.asm
index 92d0ce7db9..e5823b1e4b 100644
--- a/gmp/mpn/sparc32/v9/submul_1.asm
+++ b/gmp/mpn/sparc32/v9/submul_1.asm
@@ -4,30 +4,19 @@ dnl subtract the result from a second limb vector.
dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc32/v9/udiv.asm b/gmp/mpn/sparc32/v9/udiv.asm
index 61dde97a66..0957b8c225 100644
--- a/gmp/mpn/sparc32/v9/udiv.asm
+++ b/gmp/mpn/sparc32/v9/udiv.asm
@@ -3,30 +3,19 @@ dnl SPARC v9 32-bit mpn_udiv_qrnnd - division support for longlong.h.
dnl Copyright 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc64/README b/gmp/mpn/sparc64/README
index e2c051a02b..19072996de 100644
--- a/gmp/mpn/sparc64/README
+++ b/gmp/mpn/sparc64/README
@@ -1,30 +1,19 @@
-Copyright 1997, 1999-2002 Free Software Foundation, Inc.
+Copyright 1997, 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -76,7 +65,7 @@ Integer conditional move instructions cannot dual-issue with other integer
instructions. No conditional move can issue 1-5 cycles after a load. (This
might have been fixed for UltraSPARC-3.)
-The UltraSPARC-3 pipeline is very simular to the one of UltraSPARC-1/2 , but is
+The UltraSPARC-3 pipeline is very similar to the one of UltraSPARC-1/2, but is
somewhat slower. Branches execute slower, and there may be other new stalls.
But integer multiply doesn't stall the entire CPU and also has a much lower
latency. But it's still not pipelined, and thus useless for our needs.
diff --git a/gmp/mpn/sparc64/ultrasparc1234/add_n.asm b/gmp/mpn/sparc64/add_n.asm
index 92374d2552..c3e5b46ddf 100644
--- a/gmp/mpn/sparc64/ultrasparc1234/add_n.asm
+++ b/gmp/mpn/sparc64/add_n.asm
@@ -1,33 +1,22 @@
dnl SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl store sum in a third limb vector.
-dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -43,19 +32,19 @@ C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...
C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`vp', `%i2')
-define(`n', `%i3')
-
-define(`u0', `%l0')
-define(`u1', `%l2')
-define(`u2', `%l4')
-define(`u3', `%l6')
-define(`v0', `%l1')
-define(`v1', `%l3')
-define(`v2', `%l5')
-define(`v3', `%l7')
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')
define(`cy',`%i4')
@@ -65,24 +54,14 @@ define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
ASM_START()
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
-PROLOGUE(mpn_add_nc)
- save %sp,-160,%sp
-
- fitod %f0,%f0 C make sure f0 contains small, quiet number
- subcc n,4,%g0
- bl,pn %xcc,.Loop0
- nop
- b,a L(com)
-EPILOGUE()
-
PROLOGUE(mpn_add_n)
save %sp,-160,%sp
fitod %f0,%f0 C make sure f0 contains small, quiet number
subcc n,4,%g0
- bl,pn %xcc,.Loop0
+ bl,pn %icc,.Loop0
mov 0,cy
-L(com):
+
ldx [up+0],u0
ldx [vp+0],v0
add up,32,up
@@ -95,15 +74,15 @@ L(com):
ldx [vp-8],v3
subcc n,8,n
add u0,v0,%g1 C main add
- add %g1,cy,%g5 C carry add
+ add %g1,cy,%g4 C carry add
or u0,v0,%g2
- bl,pn %xcc,.Lend4567
+ bl,pn %icc,.Lend4567
fanop
b,a .Loop
.align 16
C START MAIN LOOP
-.Loop: andn %g2,%g5,%g2
+.Loop: andn %g2,%g4,%g2
and u0,v0,%g3
ldx [up+0],u0
fanop
@@ -115,15 +94,15 @@ C --
C --
srlx %g2,63,cy
add u1,v1,%g1
- stx %g5,[rp+0]
+ stx %g4,[rp+0]
fanop
C --
- add %g1,cy,%g5
+ add %g1,cy,%g4
or u1,v1,%g2
fmnop
fanop
C --
- andn %g2,%g5,%g2
+ andn %g2,%g4,%g2
and u1,v1,%g3
ldx [up-24],u1
fanop
@@ -135,15 +114,15 @@ C --
C --
srlx %g2,63,cy
add u2,v2,%g1
- stx %g5,[rp+8]
+ stx %g4,[rp+8]
fanop
C --
- add %g1,cy,%g5
+ add %g1,cy,%g4
or u2,v2,%g2
fmnop
fanop
C --
- andn %g2,%g5,%g2
+ andn %g2,%g4,%g2
and u2,v2,%g3
ldx [up-16],u2
fanop
@@ -155,15 +134,15 @@ C --
C --
srlx %g2,63,cy
add u3,v3,%g1
- stx %g5,[rp-16]
+ stx %g4,[rp-16]
fanop
C --
- add %g1,cy,%g5
+ add %g1,cy,%g4
or u3,v3,%g2
fmnop
fanop
C --
- andn %g2,%g5,%g2
+ andn %g2,%g4,%g2
and u3,v3,%g3
ldx [up-8],u3
fanop
@@ -175,48 +154,48 @@ C --
C --
srlx %g2,63,cy
add u0,v0,%g1
- stx %g5,[rp-8]
+ stx %g4,[rp-8]
fanop
C --
- add %g1,cy,%g5
+ add %g1,cy,%g4
or u0,v0,%g2
- bge,pt %xcc,.Loop
+ bge,pt %icc,.Loop
fanop
C END MAIN LOOP
.Lend4567:
- andn %g2,%g5,%g2
+ andn %g2,%g4,%g2
and u0,v0,%g3
or %g3,%g2,%g2
srlx %g2,63,cy
add u1,v1,%g1
- stx %g5,[rp+0]
- add %g1,cy,%g5
+ stx %g4,[rp+0]
+ add %g1,cy,%g4
or u1,v1,%g2
- andn %g2,%g5,%g2
+ andn %g2,%g4,%g2
and u1,v1,%g3
or %g3,%g2,%g2
srlx %g2,63,cy
add u2,v2,%g1
- stx %g5,[rp+8]
- add %g1,cy,%g5
+ stx %g4,[rp+8]
+ add %g1,cy,%g4
or u2,v2,%g2
- andn %g2,%g5,%g2
+ andn %g2,%g4,%g2
and u2,v2,%g3
or %g3,%g2,%g2
add rp,32,rp
srlx %g2,63,cy
add u3,v3,%g1
- stx %g5,[rp-16]
- add %g1,cy,%g5
+ stx %g4,[rp-16]
+ add %g1,cy,%g4
or u3,v3,%g2
- andn %g2,%g5,%g2
+ andn %g2,%g4,%g2
and u3,v3,%g3
or %g3,%g2,%g2
srlx %g2,63,cy
- stx %g5,[rp-8]
+ stx %g4,[rp-8]
addcc n,4,n
- bz,pn %xcc,.Lret
+ bz,pn %icc,.Lret
fanop
.Loop0: ldx [up],u0
@@ -227,15 +206,15 @@ C END MAIN LOOP
subcc n,1,n
add u0,v0,%g1
or u0,v0,%g2
- add %g1,cy,%g5
+ add %g1,cy,%g4
and u0,v0,%g3
- andn %g2,%g5,%g2
- stx %g5,[rp-8]
+ andn %g2,%g4,%g2
+ stx %g4,[rp-8]
or %g3,%g2,%g2
- bnz,pt %xcc,.Loop0
+ bnz,pt %icc,.Loop0
srlx %g2,63,cy
.Lret: mov cy,%i0
ret
restore
-EPILOGUE()
+EPILOGUE(mpn_add_n)
diff --git a/gmp/mpn/sparc64/ultrasparc1234/addmul_1.asm b/gmp/mpn/sparc64/addmul_1.asm
index 48a94146ff..bd83c6562c 100644
--- a/gmp/mpn/sparc64/ultrasparc1234/addmul_1.asm
+++ b/gmp/mpn/sparc64/addmul_1.asm
@@ -1,33 +1,23 @@
dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl the result to a second limb vector.
-dnl Copyright 1998, 2000-2004 Free Software Foundation, Inc.
+dnl Copyright 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
+dnl Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -148,7 +138,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
fmuld u32, v00, r32
fmuld u00, v48, p48
addcc %i2, 8, %i2
- bnz,pt %xcc, .L_two_or_more
+ bnz,pt %icc, .L_two_or_more
fmuld u32, v16, r48
.L_one:
@@ -226,7 +216,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
faddd p16, r80, a16
fmuld u00, v48, p48
addcc %i2, 8, %i2
- bnz,pt %xcc, .L_three_or_more
+ bnz,pt %icc, .L_three_or_more
fmuld u32, v16, r48
.L_two:
@@ -308,7 +298,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
faddd p16, r80, a16
fmuld u00, v48, p48
addcc %i2, 8, %i2
- bnz,pt %xcc, .L_four_or_more
+ bnz,pt %icc, .L_four_or_more
fmuld u32, v16, r48
.L_three:
@@ -396,7 +386,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
fmuld u00, v48, p48
add cy, %g5, %o4 C x = prev(i00) + cy
addcc %i2, 8, %i2
- bnz,pt %xcc, .Loop
+ bnz,pt %icc, .Loop
fmuld u32, v16, r48
.L_four:
@@ -473,7 +463,7 @@ C 12
C 13
add cy, %g5, %o4 C x = prev(i00) + cy
addcc %i2, 8, %i2
- bnz,pt %xcc, .Loop
+ bnz,pt %icc, .Loop
fmuld u32, v16, r48
C END MAIN LOOP
diff --git a/gmp/mpn/sparc64/ultrasparc1234/addmul_2.asm b/gmp/mpn/sparc64/addmul_2.asm
index 37674d7423..65efb5159a 100644
--- a/gmp/mpn/sparc64/ultrasparc1234/addmul_2.asm
+++ b/gmp/mpn/sparc64/addmul_2.asm
@@ -4,30 +4,19 @@ dnl number and add the result to a n limb vector.
dnl Copyright 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc64/copyd.asm b/gmp/mpn/sparc64/copyd.asm
index ab105d39c7..8a73dba8f0 100644
--- a/gmp/mpn/sparc64/copyd.asm
+++ b/gmp/mpn/sparc64/copyd.asm
@@ -1,41 +1,27 @@
dnl SPARC v9 mpn_copyd -- Copy a limb vector, decrementing.
-dnl Copyright 1999-2003 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C UltraSPARC 1&2: 2
-C UltraSPARC 3: 2.5
-C UltraSPARC T1: 17
-C UltraSPARC T3: 6
-C UltraSPARC T4/T5: 2
+C UltraSPARC 1&2: 2
+C UltraSPARC 3: 2.5
C INPUT PARAMETERS
C rptr %o0
@@ -50,7 +36,7 @@ PROLOGUE(mpn_copyd)
add %g1,%o0,%o0
add %g1,%o1,%o1
addcc %o2,-8,%o2
- bl,pt %xcc,L(end01234567)
+ bl,pt %icc,L(end01234567)
nop
L(loop1):
ldx [%o1-8],%g1
@@ -71,18 +57,18 @@ L(loop1):
stx %o4,[%o0-56]
stx %o5,[%o0-64]
addcc %o2,-8,%o2
- bge,pt %xcc,L(loop1)
+ bge,pt %icc,L(loop1)
add %o0,-64,%o0
L(end01234567):
addcc %o2,8,%o2
- bz,pn %xcc,L(end)
+ bz,pn %icc,L(end)
nop
L(loop2):
ldx [%o1-8],%g1
add %o1,-8,%o1
addcc %o2,-1,%o2
stx %g1,[%o0-8]
- bg,pt %xcc,L(loop2)
+ bg,pt %icc,L(loop2)
add %o0,-8,%o0
L(end): retl
nop
diff --git a/gmp/mpn/sparc64/copyi.asm b/gmp/mpn/sparc64/copyi.asm
index 45663dc2a3..3158357c0b 100644
--- a/gmp/mpn/sparc64/copyi.asm
+++ b/gmp/mpn/sparc64/copyi.asm
@@ -1,41 +1,27 @@
dnl SPARC v9 mpn_copyi -- Copy a limb vector, incrementing.
-dnl Copyright 1999-2003 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C UltraSPARC 1&2: 2
-C UltraSPARC 3: 2.5
-C UltraSPARC T1: 17
-C UltraSPARC T3: 6
-C UltraSPARC T4/T5: 2
+C UltraSPARC 1&2: 2
+C UltraSPARC 3: 2.5
C INPUT PARAMETERS
C rptr %o0
@@ -47,7 +33,7 @@ ASM_START()
REGISTER(%g3,#scratch)
PROLOGUE(mpn_copyi)
addcc %o2,-8,%o2
- bl,pt %xcc,L(end01234567)
+ bl,pt %icc,L(end01234567)
nop
L(loop1):
ldx [%o1+0],%g1
@@ -68,18 +54,18 @@ L(loop1):
stx %o4,[%o0+48]
stx %o5,[%o0+56]
addcc %o2,-8,%o2
- bge,pt %xcc,L(loop1)
+ bge,pt %icc,L(loop1)
add %o0,64,%o0
L(end01234567):
addcc %o2,8,%o2
- bz,pn %xcc,L(end)
+ bz,pn %icc,L(end)
nop
L(loop2):
ldx [%o1+0],%g1
add %o1,8,%o1
addcc %o2,-1,%o2
stx %g1,[%o0+0]
- bg,pt %xcc,L(loop2)
+ bg,pt %icc,L(loop2)
add %o0,8,%o0
L(end): retl
nop
diff --git a/gmp/mpn/sparc64/dive_1.c b/gmp/mpn/sparc64/dive_1.c
index c3fbf01b14..6f3d7c447c 100644
--- a/gmp/mpn/sparc64/dive_1.c
+++ b/gmp/mpn/sparc64/dive_1.c
@@ -9,28 +9,17 @@ Copyright 2000, 2001, 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/sparc64/divrem_1.c b/gmp/mpn/sparc64/divrem_1.c
index 531494a94f..06de9a6040 100644
--- a/gmp/mpn/sparc64/divrem_1.c
+++ b/gmp/mpn/sparc64/divrem_1.c
@@ -1,33 +1,22 @@
/* UltraSparc 64 mpn_divrem_1 -- mpn by limb division.
-Copyright 1991, 1993, 1994, 1996, 1998-2001, 2003 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 1996, 1998, 1999, 2000, 2001, 2003 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/sparc64/gcd_1.asm b/gmp/mpn/sparc64/gcd_1.asm
deleted file mode 100644
index e4d8de6a28..0000000000
--- a/gmp/mpn/sparc64/gcd_1.asm
+++ /dev/null
@@ -1,135 +0,0 @@
-dnl SPARC64 mpn_gcd_1.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for SPARC by Torbjörn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C UltraSPARC 1&2: 5.1
-C UltraSPARC 3: 5.0
-C UltraSPARC T1: 11.4
-C UltraSPARC T3: 10
-C UltraSPARC T4: 6
-C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
- RODATA
- TYPE(ctz_table,object)
-ctz_table:
- .byte MAXSHIFT
-forloop(i,1,MASK,
-` .byte m4_count_trailing_zeros(i)
-')
- SIZE(ctz_table,.-ctz_table)
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 14)
-
-C INPUT PARAMETERS
-define(`up', `%i0')
-define(`n', `%i1')
-define(`v0', `%i2')
-
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_gcd_1)
- save %sp, -192, %sp
- ldx [up+0], %g1 C U low limb
- mov -1, %i4
- or v0, %g1, %g2 C x | y
-
-L(twos):
- inc %i4
- andcc %g2, 1, %g0
- bz,a %xcc, L(twos)
- srlx %g2, 1, %g2
-
-L(divide_strip_y):
- andcc v0, 1, %g0
- bz,a %xcc, L(divide_strip_y)
- srlx v0, 1, v0
-
- cmp n, 1 C if n > 1 we need
- bnz %xcc, L(bmod) C to call bmod_1
- nop
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- srlx %g1, BMOD_THRES_LOG2, %g2
- cmp %g2, v0
- bleu %xcc, L(noreduce)
- mov %g1, %o0
-
-L(bmod):
- mov up, %o0
- mov n, %o1
- mov v0, %o2
- call mpn_modexact_1c_odd
- mov 0, %o3
-
-L(noreduce):
-
- LEA64(ctz_table, i5, g4)
-
- cmp %o0, 0
- bnz %xcc, L(mid)
- and %o0, MASK, %g3 C
-
- return %i7+8
- sllx %o2, %o4, %o0 C CAUTION: v0 alias for o2
-
- ALIGN(16)
-L(top): movcc %xcc, %l4, v0 C v = min(u,v)
- movcc %xcc, %l2, %o0 C u = |v - u]
-L(mid): ldub [%i5+%g3], %g5 C
- brz,a,pn %g3, L(shift_alot) C
- srlx %o0, MAXSHIFT, %o0
- srlx %o0, %g5, %l4 C new u, odd
- subcc v0, %l4, %l2 C v - u, set flags for branch and movcc
- sub %l4, v0, %o0 C u - v
- bnz,pt %xcc, L(top) C
- and %l2, MASK, %g3 C extract low MAXSHIFT bits from (v-u)
-
- return %i7+8
- sllx %o2, %o4, %o0 C CAUTION: v0 alias for o2
-
-L(shift_alot):
- b L(mid)
- and %o0, MASK, %g3 C
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/gmp-mparam.h b/gmp/mpn/sparc64/gmp-mparam.h
index 5ac2c461c5..abf523951d 100644
--- a/gmp/mpn/sparc64/gmp-mparam.h
+++ b/gmp/mpn/sparc64/gmp-mparam.h
@@ -1,139 +1,80 @@
/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004, 2006, 2008, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 500 MHz ultrasparc2 running GNU/Linux */
-
-#define DIVREM_1_NORM_THRESHOLD 3
-#define DIVREM_1_UNNORM_THRESHOLD 4
-#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 22
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 27
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define USE_PREINV_DIVREM_1 1
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 30
-#define MUL_TOOM33_THRESHOLD 187
-#define MUL_TOOM44_THRESHOLD 278
-#define MUL_TOOM6H_THRESHOLD 278
-#define MUL_TOOM8H_THRESHOLD 357
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 201
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 199
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 107
-
-#define SQR_BASECASE_THRESHOLD 13
-#define SQR_TOOM2_THRESHOLD 69
-#define SQR_TOOM3_THRESHOLD 116
-#define SQR_TOOM4_THRESHOLD 336
-#define SQR_TOOM6_THRESHOLD 336
-#define SQR_TOOM8_THRESHOLD 454
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 23
-
-#define MUL_FFT_MODF_THRESHOLD 248 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 248, 5}, { 9, 4}, { 19, 6}, { 5, 5}, \
- { 15, 6}, { 8, 5}, { 17, 6}, { 21, 7}, \
- { 19, 8}, { 11, 7}, { 25, 8}, { 15, 7}, \
- { 31, 8}, { 27, 9}, { 15, 8}, { 33, 9}, \
- { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \
- { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \
- { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \
- { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \
- { 79,11}, { 47,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 50
-#define MUL_FFT_THRESHOLD 1984
-
-#define SQR_FFT_MODF_THRESHOLD 236 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 236, 5}, { 8, 4}, { 17, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 19, 7}, { 10, 6}, \
- { 21, 7}, { 21, 8}, { 21, 9}, { 11, 8}, \
- { 23, 9}, { 19, 8}, { 43, 9}, { 23,10}, \
- { 15, 9}, { 43,10}, { 23,11}, { 15,10}, \
- { 31, 9}, { 63,10}, { 47, 8}, { 191,11}, \
- { 31,10}, { 63, 8}, { 255, 7}, { 511, 9}, \
- { 135, 8}, { 271,10}, { 71, 9}, { 143, 8}, \
- { 287, 7}, { 575,11}, { 47, 9}, { 191, 8}, \
- { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 49
-#define SQR_FFT_THRESHOLD 1120
-
-#define MULLO_BASECASE_THRESHOLD 16
-#define MULLO_DC_THRESHOLD 41
-#define MULLO_MUL_N_THRESHOLD 3791
-
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 100
-#define DC_BDIV_QR_THRESHOLD 47
-#define DC_BDIV_Q_THRESHOLD 174
-
-#define INV_MULMOD_BNM1_THRESHOLD 58
-#define INV_NEWTON_THRESHOLD 13
-#define INV_APPR_THRESHOLD 9
-
-#define BINV_NEWTON_THRESHOLD 187
-#define REDC_1_TO_REDC_2_THRESHOLD 10
-#define REDC_2_TO_REDC_N_THRESHOLD 115
-
-#define MU_DIV_QR_THRESHOLD 680
-#define MU_DIVAPPR_Q_THRESHOLD 618
-#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 748
-#define MU_BDIV_Q_THRESHOLD 889
-
-#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 53
-#define GCD_DC_THRESHOLD 283
-#define GCDEXT_DC_THRESHOLD 186
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 16
-#define SET_STR_DC_THRESHOLD 390
-#define SET_STR_PRECOMPUTE_THRESHOLD 1665
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+
+/* Tell the toom3 multiply implementation to call low-level mpn
+ functions instead of open-coding operations in C. */
+#ifndef USE_MORE_MPN
+#define USE_MORE_MPN 1
+#endif
+
+/* Generated by tuneup.c, 2009-01-15, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 33
+#define MUL_TOOM3_THRESHOLD 189
+#define MUL_TOOM44_THRESHOLD 256
+
+#define SQR_BASECASE_THRESHOLD 9
+#define SQR_KARATSUBA_THRESHOLD 70
+#define SQR_TOOM3_THRESHOLD 226
+#define SQR_TOOM4_THRESHOLD 345
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 46
+#define MULLOW_MUL_N_THRESHOLD 143
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 34
+#define POWM_THRESHOLD 116
+
+#define MATRIX22_STRASSEN_THRESHOLD 18
+#define HGCD_THRESHOLD 51
+#define GCD_DC_THRESHOLD 293
+#define GCDEXT_DC_THRESHOLD 198
+#define JACOBI_BASE_METHOD 3
+
+#define DIVREM_1_NORM_THRESHOLD 3
+#define DIVREM_1_UNNORM_THRESHOLD 3
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1_1_THRESHOLD 12
+#define MOD_1_2_THRESHOLD 13
+#define MOD_1_4_THRESHOLD 16
+#define USE_PREINV_DIVREM_1 1
+#define USE_PREINV_MOD_1 1
+#define DIVREM_2_THRESHOLD 6
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 638
+#define SET_STR_PRECOMPUTE_THRESHOLD 1889
+
+#define MUL_FFT_TABLE { 304, 608, 1600, 2816, 7168, 20480, 81920, 196608, 786432, 0 }
+#define MUL_FFT_MODF_THRESHOLD 216
+#define MUL_FFT_THRESHOLD 1664
+
+#define SQR_FFT_TABLE { 336, 736, 1600, 2816, 7168, 20480, 49152, 196608, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD 216
+#define SQR_FFT_THRESHOLD 1312
+
diff --git a/gmp/mpn/sparc64/lshift.asm b/gmp/mpn/sparc64/lshift.asm
index 90bbb454f0..b3bbd9dd99 100644
--- a/gmp/mpn/sparc64/lshift.asm
+++ b/gmp/mpn/sparc64/lshift.asm
@@ -1,140 +1,152 @@
dnl SPARC v9 mpn_lshift
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
+dnl Copyright 1996, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
include(`../config.m4')
-C cycles/limb
-C UltraSPARC 1&2: 2
-C UltraSPARC 3: 2.5
-C UltraSPARC T1: 17.5
-C UltraSPARC T3: 8
-C UltraSPARC T4: 3
+C cycles/limb
+C UltraSPARC 1&2: 2
+C UltraSPARC 3: 3.25
C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`cnt', `%i3')
-
-define(`tcnt', `%i4')
-define(`retval', `%i5')
-define(`u0', `%l0')
-define(`u1', `%l1')
-define(`r0', `%l6')
-define(`r1', `%l7')
-define(`u0_off', `%o0')
-define(`u1_off', `%o1')
-define(`r0_off', `%o2')
-define(`r1_off', `%o3')
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`n',`%i2')
+define(`cnt',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+
+define(`tnc',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
ASM_START()
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
PROLOGUE(mpn_lshift)
- save %sp, -176, %sp
-
- sllx n, 3, n
- sub %g0, cnt, tcnt
-
- sub up, 8, u1_off
- add rp, (5 * 8), r1_off
-
- ldx [n + u1_off], u1 C WAS: up - 8
- add u1_off, (3 * 8), u1_off
-
- sub r1_off, 8, r0_off
- sub u1_off, 8, u0_off
-
- subcc n, (3 * 8), n
- srlx u1, tcnt, retval
-
- bl,pn %xcc, L(end12)
- sllx u1, cnt, %l3
-
- ldx [n + u0_off], u0 C WAS: up - 16
- subcc n, (2 * 8), n
-
- ldx [n + u1_off], u1 C WAS: up - 24
-
- bl,pn %xcc, L(end34)
- srlx u0, tcnt, %l4
-
- b,a L(top)
- ALIGN(16)
-L(top):
- sllx u0, cnt, %l2
- or %l4, %l3, r0
-
- ldx [n + u0_off], u0 C WAS: up - 16
- srlx u1, tcnt, %l5
-
- stx r0, [n + r0_off] C WAS: rp - 8
- subcc n, (2 * 8), n
-
- sllx u1, cnt, %l3
- or %l2, %l5, r1
-
- ldx [n + u1_off], u1 C WAS: up - 24
- srlx u0, tcnt, %l4
-
- bge,pt %xcc, L(top)
- stx r1, [n + r1_off] C WAS: rp - 16
-
-L(end34):
- sllx u0, cnt, %l2
- or %l4, %l3, r0
-
- srlx u1, tcnt, %l5
- stx r0, [n + r0_off] C WAS: rp - 8
-
- or %l2, %l5, r1
- sub n, (2 * 8), %o5
-
- sllx u1, cnt, %l3
- stx r1, [%o5 + r1_off] C WAS: rp - 16
-
-L(end12):
- andcc n, 8, %g0
- bz,pn %xcc, L(done)
- nop
-
- ldx [n + u0_off], u1
- srlx u1, tcnt, %l4
- or %l4, %l3, r0
- stx r0, [r0_off - 24]
- sllx u1, cnt, %l3
-L(done):
- stx %l3, [r0_off - 32]
-
+ save %sp,-160,%sp
+
+ sllx n,3,%g1
+ sub %g0,cnt,tnc C negate shift count
+ add up,%g1,up C make %o1 point at end of src
+ add rp,%g1,rp C make %o0 point at end of res
+ ldx [up-8],u3 C load first limb
+ subcc n,5,n
+ srlx u3,tnc,%i5 C compute function result
+ sllx u3,cnt,%g3
+ bl,pn %icc,.Lend1234
+ fanop
+
+ subcc n,4,n
+ ldx [up-16],u0
+ ldx [up-24],u1
+ add up,-32,up
+ ldx [up-0],u2
+ ldx [up-8],u3
+ srlx u0,tnc,%g2
+
+ bl,pn %icc,.Lend5678
+ fanop
+
+ b,a .Loop
+ .align 16
+.Loop:
+ sllx u0,cnt,%g1
+ or %g3,%g2,%g3
+ ldx [up-16],u0
+ fanop
+C --
+ srlx u1,tnc,%g2
+ subcc n,4,n
+ stx %g3,[rp-8]
+ fanop
+C --
+ sllx u1,cnt,%g3
+ or %g1,%g2,%g1
+ ldx [up-24],u1
+ fanop
+C --
+ srlx u2,tnc,%g2
+ stx %g1,[rp-16]
+ add up,-32,up
+ fanop
+C --
+ sllx u2,cnt,%g1
+ or %g3,%g2,%g3
+ ldx [up-0],u2
+ fanop
+C --
+ srlx u3,tnc,%g2
+ stx %g3,[rp-24]
+ add rp,-32,rp
+ fanop
+C --
+ sllx u3,cnt,%g3
+ or %g1,%g2,%g1
+ ldx [up-8],u3
+ fanop
+C --
+ srlx u0,tnc,%g2
+ stx %g1,[rp-0]
+ bge,pt %icc,.Loop
+ fanop
+C --
+.Lend5678:
+ sllx u0,cnt,%g1
+ or %g3,%g2,%g3
+ srlx u1,tnc,%g2
+ stx %g3,[rp-8]
+ sllx u1,cnt,%g3
+ or %g1,%g2,%g1
+ srlx u2,tnc,%g2
+ stx %g1,[rp-16]
+ sllx u2,cnt,%g1
+ or %g3,%g2,%g3
+ srlx u3,tnc,%g2
+ stx %g3,[rp-24]
+ add rp,-32,rp
+ sllx u3,cnt,%g3 C carry...
+ or %g1,%g2,%g1
+ stx %g1,[rp-0]
+
+.Lend1234:
+ addcc n,4,n
+ bz,pn %icc,.Lret
+ fanop
+.Loop0:
+ add rp,-8,rp
+ subcc n,1,n
+ ldx [up-16],u3
+ add up,-8,up
+ srlx u3,tnc,%g2
+ or %g3,%g2,%g3
+ stx %g3,[rp]
+ sllx u3,cnt,%g3
+ bnz,pt %icc,.Loop0
+ fanop
+.Lret:
+ stx %g3,[rp-8]
+ mov %i5,%i0
ret
- restore retval, 0, %o0
-EPILOGUE()
+ restore
+EPILOGUE(mpn_lshift)
diff --git a/gmp/mpn/sparc64/lshiftc.asm b/gmp/mpn/sparc64/lshiftc.asm
deleted file mode 100644
index 4a0f0a3e40..0000000000
--- a/gmp/mpn/sparc64/lshiftc.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-dnl SPARC v9 mpn_lshiftc
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC 1&2: 3
-C UltraSPARC 3: 3
-C UltraSPARC T1: 17
-C UltraSPARC T3: 10
-C UltraSPARC T4: 3.5
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`cnt', `%i3')
-
-define(`tcnt', `%i4')
-define(`retval', `%i5')
-define(`u0', `%l0')
-define(`u1', `%l1')
-define(`r0', `%l6')
-define(`r1', `%l7')
-define(`u0_off', `%o0')
-define(`u1_off', `%o1')
-define(`r0_off', `%o2')
-define(`r1_off', `%o3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_lshiftc)
- save %sp, -176, %sp
-
- sllx n, 3, n
- sub %g0, cnt, tcnt
-
- sub up, 8, u1_off
- add rp, (5 * 8), r1_off
-
- ldx [n + u1_off], u1 C WAS: up - 8
- add u1_off, (3 * 8), u1_off
-
- sub r1_off, 8, r0_off
- sub u1_off, 8, u0_off
-
- subcc n, (3 * 8), n
- srlx u1, tcnt, retval
-
- bl,pn %xcc, L(end12)
- sllx u1, cnt, %l3
-
- ldx [n + u0_off], u0 C WAS: up - 16
- subcc n, (2 * 8), n
-
- ldx [n + u1_off], u1 C WAS: up - 24
-
- bl,pn %xcc, L(end34)
- srlx u0, tcnt, %l4
-
- b,a L(top)
- ALIGN(16)
-L(top):
- not %l3, %l3
- sllx u0, cnt, %l2
-
- andn %l3, %l4, r0
- ldx [n + u0_off], u0 C WAS: up - 16
-
- srlx u1, tcnt, %l5
- stx r0, [n + r0_off] C WAS: rp - 8
-
- subcc n, (2 * 8), n
- not %l2, %l2
-
- sllx u1, cnt, %l3
- andn %l2, %l5, r1
-
- ldx [n + u1_off], u1 C WAS: up - 24
- srlx u0, tcnt, %l4
-
- bge,pt %xcc, L(top)
- stx r1, [n + r1_off] C WAS: rp - 16
-
-L(end34):
- not %l3, %l3
- sllx u0, cnt, %l2
-
- andn %l3, %l4, r0
- srlx u1, tcnt, %l5
-
- stx r0, [n + r0_off] C WAS: rp - 8
- not %l2, %l2
-
- andn %l2, %l5, r1
- sub n, (2 * 8), %o5
-
- sllx u1, cnt, %l3
- stx r1, [%o5 + r1_off] C WAS: rp - 16
-
-L(end12):
- andcc n, 8, %g0
- bz %xcc, L(done)+4
- not %l3, %l3
-
- ldx [n + u0_off], u1
- srlx u1, tcnt, %l4
- andn %l3, %l4, r0
- stx r0, [r0_off - 24]
- sllx u1, cnt, %l3
-L(done):
- not %l3, %l3
- stx %l3, [r0_off - 32]
-
- ret
- restore retval, 0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/mod_1.c b/gmp/mpn/sparc64/mod_1.c
index f1c51970d9..757ae01b95 100644
--- a/gmp/mpn/sparc64/mod_1.c
+++ b/gmp/mpn/sparc64/mod_1.c
@@ -1,33 +1,22 @@
/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.
-Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation,
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2003 Free Software Foundation,
Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
@@ -57,8 +46,8 @@ see https://www.gnu.org/licenses/. */
sizes, but at size==2 it was only about the same speed and at size==3 was
slower. */
-static mp_limb_t
-mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
+mp_limb_t
+mpn_mod_1 (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
int norm, norm_rshift;
mp_limb_t src_high_limb;
@@ -186,54 +175,3 @@ mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb
return r >> norm;
}
}
-
-mp_limb_t
-mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
-{
- ASSERT (n >= 0);
- ASSERT (b != 0);
-
- /* Should this be handled at all? Rely on callers? Note un==0 is currently
- required by mpz/fdiv_r_ui.c and possibly other places. */
- if (n == 0)
- return 0;
-
- if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
- {
- if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
- {
- return mpn_mod_1_anynorm (ap, n, b);
- }
- else
- {
- mp_limb_t pre[4];
- mpn_mod_1_1p_cps (pre, b);
- return mpn_mod_1_1p (ap, n, b, pre);
- }
- }
- else
- {
- if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
- {
- return mpn_mod_1_anynorm (ap, n, b);
- }
- else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
- {
- mp_limb_t pre[4];
- mpn_mod_1_1p_cps (pre, b);
- return mpn_mod_1_1p (ap, n, b << pre[1], pre);
- }
- else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
- {
- mp_limb_t pre[5];
- mpn_mod_1s_2p_cps (pre, b);
- return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
- }
- else
- {
- mp_limb_t pre[7];
- mpn_mod_1s_4p_cps (pre, b);
- return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
- }
- }
-}
diff --git a/gmp/mpn/sparc64/mod_1_4.c b/gmp/mpn/sparc64/mod_1_4.c
deleted file mode 100644
index cc1b9484bc..0000000000
--- a/gmp/mpn/sparc64/mod_1_4.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/* mpn_mod_1s_4p (ap, n, b, cps)
- Divide (ap,,n) by b. Return the single-limb remainder.
- Requires that d < B / 4.
-
- Contributed to the GNU project by Torbjorn Granlund.
- Based on a suggestion by Peter L. Montgomery.
-
- THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
- SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
- GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-Copyright 2008-2010 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-#include "mpn/sparc64/sparc64.h"
-
-void
-mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
-{
- mp_limb_t bi;
- mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
- int cnt;
-
- ASSERT (b <= (~(mp_limb_t) 0) / 4);
-
- count_leading_zeros (cnt, b);
-
- b <<= cnt;
- invert_limb (bi, b);
-
- cps[0] = bi;
- cps[1] = cnt;
-
- B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
- ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
- cps[2] = B1modb >> cnt;
-
- udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
- cps[3] = B2modb >> cnt;
-
- udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
- cps[4] = B3modb >> cnt;
-
- udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
- cps[5] = B4modb >> cnt;
-
- udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi);
- cps[6] = B5modb >> cnt;
-
-#if WANT_ASSERT
- {
- int i;
- b = cps[2];
- for (i = 3; i <= 6; i++)
- {
- b += cps[i];
- ASSERT (b >= cps[i]);
- }
- }
-#endif
-}
-
-mp_limb_t
-mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
-{
- mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
- mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
- mp_size_t i;
- int cnt;
-
- ASSERT (n >= 1);
-
- B1modb = cps[2];
- B2modb = cps[3];
- B3modb = cps[4];
- B4modb = cps[5];
- B5modb = cps[6];
-
- if ((b >> 32) == 0)
- {
- switch (n & 3)
- {
- case 0:
- umul_ppmm_s (ph, pl, ap[n - 3], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]);
- umul_ppmm_s (ch, cl, ap[n - 2], B2modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
- umul_ppmm_s (rh, rl, ap[n - 1], B3modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- n -= 4;
- break;
- case 1:
- rh = 0;
- rl = ap[n - 1];
- n -= 1;
- break;
- case 2:
- rh = ap[n - 1];
- rl = ap[n - 2];
- n -= 2;
- break;
- case 3:
- umul_ppmm_s (ph, pl, ap[n - 2], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
- umul_ppmm_s (rh, rl, ap[n - 1], B2modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- n -= 3;
- break;
- }
-
- for (i = n - 4; i >= 0; i -= 4)
- {
- /* rr = ap[i] < B
- + ap[i+1] * (B mod b) <= (B-1)(b-1)
- + ap[i+2] * (B^2 mod b) <= (B-1)(b-1)
- + ap[i+3] * (B^3 mod b) <= (B-1)(b-1)
- + LO(rr) * (B^4 mod b) <= (B-1)(b-1)
- + HI(rr) * (B^5 mod b) <= (B-1)(b-1)
- */
- umul_ppmm_s (ph, pl, ap[i + 1], B1modb);
- add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
-
- umul_ppmm_s (ch, cl, ap[i + 2], B2modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
-
- umul_ppmm_s (ch, cl, ap[i + 3], B3modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
-
- umul_ppmm_s (ch, cl, rl, B4modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
-
- umul_ppmm_s (rh, rl, rh, B5modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- }
-
- umul_ppmm_s (rh, cl, rh, B1modb);
- add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
- }
- else
- {
- switch (n & 3)
- {
- case 0:
- umul_ppmm (ph, pl, ap[n - 3], B1modb);
- add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]);
- umul_ppmm (ch, cl, ap[n - 2], B2modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
- umul_ppmm (rh, rl, ap[n - 1], B3modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- n -= 4;
- break;
- case 1:
- rh = 0;
- rl = ap[n - 1];
- n -= 1;
- break;
- case 2:
- rh = ap[n - 1];
- rl = ap[n - 2];
- n -= 2;
- break;
- case 3:
- umul_ppmm (ph, pl, ap[n - 2], B1modb);
- add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
- umul_ppmm (rh, rl, ap[n - 1], B2modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- n -= 3;
- break;
- }
-
- for (i = n - 4; i >= 0; i -= 4)
- {
- /* rr = ap[i] < B
- + ap[i+1] * (B mod b) <= (B-1)(b-1)
- + ap[i+2] * (B^2 mod b) <= (B-1)(b-1)
- + ap[i+3] * (B^3 mod b) <= (B-1)(b-1)
- + LO(rr) * (B^4 mod b) <= (B-1)(b-1)
- + HI(rr) * (B^5 mod b) <= (B-1)(b-1)
- */
- umul_ppmm (ph, pl, ap[i + 1], B1modb);
- add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);
-
- umul_ppmm (ch, cl, ap[i + 2], B2modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
-
- umul_ppmm (ch, cl, ap[i + 3], B3modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
-
- umul_ppmm (ch, cl, rl, B4modb);
- add_ssaaaa (ph, pl, ph, pl, ch, cl);
-
- umul_ppmm (rh, rl, rh, B5modb);
- add_ssaaaa (rh, rl, rh, rl, ph, pl);
- }
-
- umul_ppmm (rh, cl, rh, B1modb);
- add_ssaaaa (rh, rl, rh, rl, 0, cl);
- }
-
- bi = cps[0];
- cnt = cps[1];
-
- r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
- udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
-
- return r >> cnt;
-}
diff --git a/gmp/mpn/sparc64/mode1o.c b/gmp/mpn/sparc64/mode1o.c
index 7c8fc1cf3d..5ec97c5cd4 100644
--- a/gmp/mpn/sparc64/mode1o.c
+++ b/gmp/mpn/sparc64/mode1o.c
@@ -4,33 +4,22 @@
CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
FUTURE GNU MP RELEASES.
-Copyright 2000-2003 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
diff --git a/gmp/mpn/sparc64/ultrasparc1234/mul_1.asm b/gmp/mpn/sparc64/mul_1.asm
index 871d562fcb..e57e822bae 100644
--- a/gmp/mpn/sparc64/ultrasparc1234/mul_1.asm
+++ b/gmp/mpn/sparc64/mul_1.asm
@@ -1,33 +1,22 @@
dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl the result in a second limb vector.
-dnl Copyright 1998, 2000-2003 Free Software Foundation, Inc.
+dnl Copyright 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -147,7 +136,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
fmuld u32, v00, r32
fmuld u00, v48, p48
addcc %i2, 8, %i2
- bnz,pt %xcc, .L_two_or_more
+ bnz,pt %icc, .L_two_or_more
fmuld u32, v16, r48
.L_one:
@@ -222,7 +211,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
faddd p16, r80, a16
fmuld u00, v48, p48
addcc %i2, 8, %i2
- bnz,pt %xcc, .L_three_or_more
+ bnz,pt %icc, .L_three_or_more
fmuld u32, v16, r48
.L_two:
@@ -299,7 +288,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
faddd p16, r80, a16
fmuld u00, v48, p48
addcc %i2, 8, %i2
- bnz,pt %xcc, .L_four_or_more
+ bnz,pt %icc, .L_four_or_more
fmuld u32, v16, r48
.L_three:
@@ -381,7 +370,7 @@ C The software pipeline is very deep, requiring 4 feed-in stages.
fmuld u00, v48, p48
add cy, %g5, %o4 C x = prev(i00) + cy
addcc %i2, 8, %i2
- bnz,pt %xcc, .Loop
+ bnz,pt %icc, .Loop
fmuld u32, v16, r48
.L_four:
@@ -455,7 +444,7 @@ C 12
C 13
add cy, %g5, %o4 C x = prev(i00) + cy
addcc %i2, 8, %i2
- bnz,pt %xcc, .Loop
+ bnz,pt %icc, .Loop
fmuld u32, v16, r48
C END MAIN LOOP
diff --git a/gmp/mpn/sparc64/rshift.asm b/gmp/mpn/sparc64/rshift.asm
index 3f8e11fee7..691fe012d3 100644
--- a/gmp/mpn/sparc64/rshift.asm
+++ b/gmp/mpn/sparc64/rshift.asm
@@ -1,142 +1,149 @@
dnl SPARC v9 mpn_rshift
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
+dnl Copyright 1996, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
include(`../config.m4')
-C cycles/limb
-C UltraSPARC 1&2: 2
-C UltraSPARC 3: 2.5
-C UltraSPARC T1: 17.5
-C UltraSPARC T3: 8
-C UltraSPARC T4: 3
+C cycles/limb
+C UltraSPARC 1&2: 2
+C UltraSPARC 3: 3.25
C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`cnt', `%i3')
-
-define(`tcnt', `%i4')
-define(`retval', `%i5')
-define(`u0', `%l0')
-define(`u1', `%l1')
-define(`r0', `%l6')
-define(`r1', `%l7')
-define(`u0_off', `%o0')
-define(`u1_off', `%o1')
-define(`r0_off', `%o2')
-define(`r1_off', `%o3')
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`n',`%i2')
+define(`cnt',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+
+define(`tnc',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
ASM_START()
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
PROLOGUE(mpn_rshift)
- save %sp, -176, %sp
-
- sllx n, 3, n
- sub %g0, cnt, tcnt
-
- add up, n, up
- add rp, n, rp
-
- neg n, n
- sub up, (2 * 8), u0_off
- sub rp, (5 * 8), r0_off
-
- ldx [n + up], u1 C WAS: up + 0
- sub u0_off, (1 * 8), u1_off
- sub r0_off, (1 * 8), r1_off
-
- subcc n, -(3 * 8), n
- sllx u1, tcnt, retval
-
- bg,pn %xcc, L(end12)
- srlx u1, cnt, %l3
-
- ldx [n + u0_off], u0 C WAS: up + 0
- subcc n, -(2 * 8), n
-
- ldx [n + u1_off], u1 C WAS: up + 8
-
- bg,pn %xcc, L(end34)
- sllx u0, tcnt, %l4
-
- b,a L(top)
- ALIGN(16)
-L(top):
- srlx u0, cnt, %l2
- or %l3, %l4, r0
-
- ldx [n + u0_off], u0 C WAS: up + 0
- sllx u1, tcnt, %l5
-
- stx r0, [n + r0_off] C WAS: rp + 0
- subcc n, -(2 * 8), n
-
- srlx u1, cnt, %l3
- or %l2, %l5, r1
-
- ldx [n + u1_off], u1 C WAS: up + 8
- sllx u0, tcnt, %l4
-
- ble,pt %xcc, L(top)
- stx r1, [n + r1_off] C WAS: rp + 8
-
-L(end34):
- srlx u0, cnt, %l2
- or %l3, %l4, r0
-
- sllx u1, tcnt, %l5
- stx r0, [n + r0_off] C WAS: rp + 0
-
- or %l2, %l5, r1
- sub n, -(2 * 8), %o5
-
- srlx u1, cnt, %l3
- stx r1, [%o5 + r1_off] C WAS: rp + 8
-
-L(end12):
- andcc n, 8, %g0
- bz,pn %xcc, L(done)
- nop
-
- ldx [n + u0_off], u1
- sllx u1, tcnt, %l4
- or %l3, %l4, r0
- stx r0, [r0_off + 24]
- srlx u1, cnt, %l3
-L(done):
- stx %l3, [r0_off + 32]
-
+ save %sp,-160,%sp
+
+ sub %g0,cnt,tnc C negate shift count
+ ldx [up],u3 C load first limb
+ subcc n,5,n
+ sllx u3,tnc,%i5 C compute function result
+ srlx u3,cnt,%g3
+ bl,pn %icc,.Lend1234
+ fanop
+
+ subcc n,4,n
+ ldx [up+8],u0
+ ldx [up+16],u1
+ add up,32,up
+ ldx [up-8],u2
+ ldx [up+0],u3
+ sllx u0,tnc,%g2
+
+ bl,pn %icc,.Lend5678
+ fanop
+
+ b,a .Loop
+ .align 16
+.Loop:
+ srlx u0,cnt,%g1
+ or %g3,%g2,%g3
+ ldx [up+8],u0
+ fanop
+C --
+ sllx u1,tnc,%g2
+ subcc n,4,n
+ stx %g3,[rp+0]
+ fanop
+C --
+ srlx u1,cnt,%g3
+ or %g1,%g2,%g1
+ ldx [up+16],u1
+ fanop
+C --
+ sllx u2,tnc,%g2
+ stx %g1,[rp+8]
+ add up,32,up
+ fanop
+C --
+ srlx u2,cnt,%g1
+ or %g3,%g2,%g3
+ ldx [up-8],u2
+ fanop
+C --
+ sllx u3,tnc,%g2
+ stx %g3,[rp+16]
+ add rp,32,rp
+ fanop
+C --
+ srlx u3,cnt,%g3
+ or %g1,%g2,%g1
+ ldx [up+0],u3
+ fanop
+C --
+ sllx u0,tnc,%g2
+ stx %g1,[rp-8]
+ bge,pt %icc,.Loop
+ fanop
+C --
+.Lend5678:
+ srlx u0,cnt,%g1
+ or %g3,%g2,%g3
+ sllx u1,tnc,%g2
+ stx %g3,[rp+0]
+ srlx u1,cnt,%g3
+ or %g1,%g2,%g1
+ sllx u2,tnc,%g2
+ stx %g1,[rp+8]
+ srlx u2,cnt,%g1
+ or %g3,%g2,%g3
+ sllx u3,tnc,%g2
+ stx %g3,[rp+16]
+ add rp,32,rp
+ srlx u3,cnt,%g3 C carry...
+ or %g1,%g2,%g1
+ stx %g1,[rp-8]
+
+.Lend1234:
+ addcc n,4,n
+ bz,pn %icc,.Lret
+ fanop
+.Loop0:
+ add rp,8,rp
+ subcc n,1,n
+ ldx [up+8],u3
+ add up,8,up
+ sllx u3,tnc,%g2
+ or %g3,%g2,%g3
+ stx %g3,[rp-8]
+ srlx u3,cnt,%g3
+ bnz,pt %icc,.Loop0
+ fanop
+.Lret:
+ stx %g3,[rp+0]
+ mov %i5,%i0
ret
- restore retval, 0, %o0
-EPILOGUE()
+ restore
+EPILOGUE(mpn_rshift)
diff --git a/gmp/mpn/sparc64/sec_tabselect.asm b/gmp/mpn/sparc64/sec_tabselect.asm
deleted file mode 100644
index 22e0dc5ef1..0000000000
--- a/gmp/mpn/sparc64/sec_tabselect.asm
+++ /dev/null
@@ -1,162 +0,0 @@
-dnl SPARC v9 mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund and David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC 1&2: 2 hopefully
-C UltraSPARC 3: 3
-C UltraSPARC T1: 17
-C UltraSPARC T3: ?
-C UltraSPARC T4/T5: 2.25 hopefully
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`tp', `%i1')
-define(`n', `%i2')
-define(`nents', `%i3')
-define(`which', `%i4')
-
-define(`i', `%g1')
-define(`j', `%g3')
-define(`stride', `%g4')
-define(`tporig', `%g5')
-define(`mask', `%o0')
-
-define(`data0', `%l0')
-define(`data1', `%l1')
-define(`data2', `%l2')
-define(`data3', `%l3')
-define(`t0', `%l4')
-define(`t1', `%l5')
-define(`t2', `%l6')
-define(`t3', `%l7')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_sec_tabselect)
- save %sp, -176, %sp
-
- sllx n, 3, stride
- sub n, 4, j
- brlz j, L(outer_end)
- mov tp, tporig
-
-L(outer_loop):
- clr data0
- clr data1
- clr data2
- clr data3
- mov tporig, tp
- mov nents, i
- mov which, %o1
-
-L(top): subcc %o1, 1, %o1 C set carry iff o1 = 0
- ldx [tp + 0], t0
- subc %g0, %g0, mask
- ldx [tp + 8], t1
- sub i, 1, i
- ldx [tp + 16], t2
- ldx [tp + 24], t3
- add tp, stride, tp
- and t0, mask, t0
- and t1, mask, t1
- or t0, data0, data0
- and t2, mask, t2
- or t1, data1, data1
- and t3, mask, t3
- or t2, data2, data2
- brnz i, L(top)
- or t3, data3, data3
-
- stx data0, [rp + 0]
- subcc j, 4, j
- stx data1, [rp + 8]
- stx data2, [rp + 16]
- stx data3, [rp + 24]
- add tporig, (4 * 8), tporig
-
- brgez j, L(outer_loop)
- add rp, (4 * 8), rp
-L(outer_end):
-
-
- andcc n, 2, %g0
- be L(b0x)
- nop
-L(b1x): clr data0
- clr data1
- mov tporig, tp
- mov nents, i
- mov which, %o1
-
-L(tp2): subcc %o1, 1, %o1
- ldx [tp + 0], t0
- subc %g0, %g0, mask
- ldx [tp + 8], t1
- sub i, 1, i
- add tp, stride, tp
- and t0, mask, t0
- and t1, mask, t1
- or t0, data0, data0
- brnz i, L(tp2)
- or t1, data1, data1
-
- stx data0, [rp + 0]
- stx data1, [rp + 8]
- add tporig, (2 * 8), tporig
- add rp, (2 * 8), rp
-
-
-L(b0x): andcc n, 1, %g0
- be L(b00)
- nop
-L(b01): clr data0
- mov tporig, tp
- mov nents, i
- mov which, %o1
-
-L(tp1): subcc %o1, 1, %o1
- ldx [tp + 0], t0
- subc %g0, %g0, mask
- sub i, 1, i
- add tp, stride, tp
- and t0, mask, t0
- brnz i, L(tp1)
- or t0, data0, data0
-
- stx data0, [rp + 0]
-
-L(b00): ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/sparc64.h b/gmp/mpn/sparc64/sparc64.h
index 09fc16d46a..945e422f5a 100644
--- a/gmp/mpn/sparc64/sparc64.h
+++ b/gmp/mpn/sparc64/sparc64.h
@@ -9,28 +9,17 @@ Copyright 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define LOW32(x) ((x) & 0xFFFFFFFF)
@@ -140,24 +129,6 @@ Error, error, unknown limb endianness;
#endif
-/* Multiply u anv v, where v < 2^32. */
-#define umul_ppmm_s(w1, w0, u, v) \
- do { \
- UWtype __x0, __x2; \
- UWtype __ul, __vl, __uh; \
- UWtype __u = (u), __v = (v); \
- \
- __ul = __ll_lowpart (__u); \
- __uh = __ll_highpart (__u); \
- __vl = __ll_lowpart (__v); \
- \
- __x0 = (UWtype) __ul * __vl; \
- __x2 = (UWtype) __uh * __vl; \
- \
- (w1) = (__x2 + (__x0 >> W_TYPE_SIZE/2)) >> W_TYPE_SIZE/2; \
- (w0) = (__x2 << W_TYPE_SIZE/2) + __x0; \
- } while (0)
-
/* Count the leading zeros on a limb, but assuming it fits in 32 bits.
The count returned will be in the range 32 to 63.
This is the 32-bit generic C count_leading_zeros from longlong.h. */
diff --git a/gmp/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/gmp/mpn/sparc64/sqr_diagonal.asm
index 43c69d31d1..fbbb4ff456 100644
--- a/gmp/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm
+++ b/gmp/mpn/sparc64/sqr_diagonal.asm
@@ -3,30 +3,19 @@ dnl SPARC v9 64-bit mpn_sqr_diagonal.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc64/ultrasparc1234/sub_n.asm b/gmp/mpn/sparc64/sub_n.asm
index 9fb7f70747..e6fe9ee62c 100644
--- a/gmp/mpn/sparc64/ultrasparc1234/sub_n.asm
+++ b/gmp/mpn/sparc64/sub_n.asm
@@ -1,33 +1,22 @@
dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl store difference in a third limb vector.
-dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -65,24 +54,14 @@ define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
ASM_START()
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
-PROLOGUE(mpn_sub_nc)
- save %sp,-160,%sp
-
- fitod %f0,%f0 C make sure f0 contains small, quiet number
- subcc n,4,%g0
- bl,pn %xcc,.Loop0
- nop
- b,a L(com)
-EPILOGUE()
-
PROLOGUE(mpn_sub_n)
save %sp,-160,%sp
fitod %f0,%f0 C make sure f0 contains small, quiet number
subcc n,4,%g0
- bl,pn %xcc,.Loop0
+ bl,pn %icc,.Loop0
mov 0,cy
-L(com):
+
ldx [up+0],u0
ldx [vp+0],v0
add up,32,up
@@ -95,15 +74,15 @@ L(com):
ldx [vp-8],v3
subcc n,8,n
sub u0,v0,%g1 C main sub
- sub %g1,cy,%g5 C carry sub
+ sub %g1,cy,%g4 C carry sub
orn u0,v0,%g2
- bl,pn %xcc,.Lend4567
+ bl,pn %icc,.Lend4567
fanop
b,a .Loop
.align 16
C START MAIN LOOP
-.Loop: orn %g5,%g2,%g2
+.Loop: orn %g4,%g2,%g2
andn u0,v0,%g3
ldx [up+0],u0
fanop
@@ -115,15 +94,15 @@ C --
C --
srlx %g2,63,cy
sub u1,v1,%g1
- stx %g5,[rp+0]
+ stx %g4,[rp+0]
fanop
C --
- sub %g1,cy,%g5
+ sub %g1,cy,%g4
orn u1,v1,%g2
fmnop
fanop
C --
- orn %g5,%g2,%g2
+ orn %g4,%g2,%g2
andn u1,v1,%g3
ldx [up-24],u1
fanop
@@ -135,15 +114,15 @@ C --
C --
srlx %g2,63,cy
sub u2,v2,%g1
- stx %g5,[rp+8]
+ stx %g4,[rp+8]
fanop
C --
- sub %g1,cy,%g5
+ sub %g1,cy,%g4
orn u2,v2,%g2
fmnop
fanop
C --
- orn %g5,%g2,%g2
+ orn %g4,%g2,%g2
andn u2,v2,%g3
ldx [up-16],u2
fanop
@@ -155,15 +134,15 @@ C --
C --
srlx %g2,63,cy
sub u3,v3,%g1
- stx %g5,[rp-16]
+ stx %g4,[rp-16]
fanop
C --
- sub %g1,cy,%g5
+ sub %g1,cy,%g4
orn u3,v3,%g2
fmnop
fanop
C --
- orn %g5,%g2,%g2
+ orn %g4,%g2,%g2
andn u3,v3,%g3
ldx [up-8],u3
fanop
@@ -175,48 +154,48 @@ C --
C --
srlx %g2,63,cy
sub u0,v0,%g1
- stx %g5,[rp-8]
+ stx %g4,[rp-8]
fanop
C --
- sub %g1,cy,%g5
+ sub %g1,cy,%g4
orn u0,v0,%g2
- bge,pt %xcc,.Loop
+ bge,pt %icc,.Loop
fanop
C END MAIN LOOP
.Lend4567:
- orn %g5,%g2,%g2
+ orn %g4,%g2,%g2
andn u0,v0,%g3
andn %g2,%g3,%g2
srlx %g2,63,cy
sub u1,v1,%g1
- stx %g5,[rp+0]
- sub %g1,cy,%g5
+ stx %g4,[rp+0]
+ sub %g1,cy,%g4
orn u1,v1,%g2
- orn %g5,%g2,%g2
+ orn %g4,%g2,%g2
andn u1,v1,%g3
andn %g2,%g3,%g2
srlx %g2,63,cy
sub u2,v2,%g1
- stx %g5,[rp+8]
- sub %g1,cy,%g5
+ stx %g4,[rp+8]
+ sub %g1,cy,%g4
orn u2,v2,%g2
- orn %g5,%g2,%g2
+ orn %g4,%g2,%g2
andn u2,v2,%g3
andn %g2,%g3,%g2
add rp,32,rp
srlx %g2,63,cy
sub u3,v3,%g1
- stx %g5,[rp-16]
- sub %g1,cy,%g5
+ stx %g4,[rp-16]
+ sub %g1,cy,%g4
orn u3,v3,%g2
- orn %g5,%g2,%g2
+ orn %g4,%g2,%g2
andn u3,v3,%g3
andn %g2,%g3,%g2
srlx %g2,63,cy
- stx %g5,[rp-8]
+ stx %g4,[rp-8]
addcc n,4,n
- bz,pn %xcc,.Lret
+ bz,pn %icc,.Lret
fanop
.Loop0: ldx [up],u0
@@ -227,12 +206,12 @@ C END MAIN LOOP
subcc n,1,n
sub u0,v0,%g1
orn u0,v0,%g2
- sub %g1,cy,%g5
+ sub %g1,cy,%g4
andn u0,v0,%g3
- orn %g5,%g2,%g2
- stx %g5,[rp-8]
+ orn %g4,%g2,%g2
+ stx %g4,[rp-8]
andn %g2,%g3,%g2
- bnz,pt %xcc,.Loop0
+ bnz,pt %icc,.Loop0
srlx %g2,63,cy
.Lret: mov cy,%i0
diff --git a/gmp/mpn/sparc64/ultrasparc1234/submul_1.asm b/gmp/mpn/sparc64/submul_1.asm
index 0bdb566b9f..ba91200315 100644
--- a/gmp/mpn/sparc64/ultrasparc1234/submul_1.asm
+++ b/gmp/mpn/sparc64/submul_1.asm
@@ -1,33 +1,22 @@
dnl SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl subtract the result from a second limb vector.
-dnl Copyright 2001-2003 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/sparc64/ultrasparc1234/lshiftc.asm b/gmp/mpn/sparc64/ultrasparc1234/lshiftc.asm
deleted file mode 100644
index 47286d569e..0000000000
--- a/gmp/mpn/sparc64/ultrasparc1234/lshiftc.asm
+++ /dev/null
@@ -1,165 +0,0 @@
-dnl SPARC v9 mpn_lshiftc
-
-dnl Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC 1&2: 3
-C UltraSPARC 3: 2.67
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`cnt',`%i3')
-
-define(`u0', `%l0')
-define(`u1', `%l2')
-define(`u2', `%l4')
-define(`u3', `%l6')
-
-define(`tnc',`%i4')
-
-define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_lshiftc)
- save %sp,-160,%sp
-
- sllx n,3,%g1
- sub %g0,cnt,tnc C negate shift count
- add up,%g1,up C make %o1 point at end of src
- add rp,%g1,rp C make %o0 point at end of res
- ldx [up-8],u3 C load first limb
- subcc n,5,n
- srlx u3,tnc,%i5 C compute function result
- bl,pn %xcc,.Lend1234
- sllx u3,cnt,%g3
-
- subcc n,4,n
- ldx [up-16],u0
- ldx [up-24],u1
- add up,-32,up
- ldx [up-0],u2
- ldx [up-8],u3
- srlx u0,tnc,%g2
- bl,pn %xcc,.Lend5678
- not %g3, %g3
-
- b,a .Loop
- ALIGN(16)
-.Loop:
- sllx u0,cnt,%g1
- andn %g3,%g2,%g3
- ldx [up-16],u0
- fanop
-C --
- srlx u1,tnc,%g2
- subcc n,4,n
- stx %g3,[rp-8]
- not %g1, %g1
-C --
- sllx u1,cnt,%g3
- andn %g1,%g2,%g1
- ldx [up-24],u1
- fanop
-C --
- srlx u2,tnc,%g2
- stx %g1,[rp-16]
- add up,-32,up
- not %g3, %g3
-C --
- sllx u2,cnt,%g1
- andn %g3,%g2,%g3
- ldx [up-0],u2
- fanop
-C --
- srlx u3,tnc,%g2
- stx %g3,[rp-24]
- add rp,-32,rp
- not %g1, %g1
-C --
- sllx u3,cnt,%g3
- andn %g1,%g2,%g1
- ldx [up-8],u3
- fanop
-C --
- srlx u0,tnc,%g2
- stx %g1,[rp-0]
- bge,pt %xcc,.Loop
- not %g3, %g3
-C --
-.Lend5678:
- sllx u0,cnt,%g1
- andn %g3,%g2,%g3
- srlx u1,tnc,%g2
- stx %g3,[rp-8]
- not %g1, %g1
- sllx u1,cnt,%g3
- andn %g1,%g2,%g1
- srlx u2,tnc,%g2
- stx %g1,[rp-16]
- not %g3, %g3
- sllx u2,cnt,%g1
- andn %g3,%g2,%g3
- srlx u3,tnc,%g2
- stx %g3,[rp-24]
- add rp,-32,rp
- not %g1, %g1
- sllx u3,cnt,%g3 C carry...
- andn %g1,%g2,%g1
- stx %g1,[rp-0]
-
-.Lend1234:
- addcc n,4,n
- bz,pn %xcc,.Lret
- fanop
-.Loop0:
- add rp,-8,rp
- subcc n,1,n
- ldx [up-16],u3
- add up,-8,up
- srlx u3,tnc,%g2
- not %g3, %g3
- andn %g3,%g2,%g3
- stx %g3,[rp]
- sllx u3,cnt,%g3
- bnz,pt %xcc,.Loop0
- fanop
-.Lret:
- not %g3, %g3
- stx %g3,[rp-8]
- mov %i5,%i0
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparc34/gmp-mparam.h b/gmp/mpn/sparc64/ultrasparc34/gmp-mparam.h
deleted file mode 100644
index 0c525bbdcf..0000000000
--- a/gmp/mpn/sparc64/ultrasparc34/gmp-mparam.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/* ultrasparc3/4 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010, 2014 Free
-Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */
-/* FFT tuning limit = 60000000 */
-/* Generated by tuneup.c, 2014-03-14, gcc 3.4 */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 20
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 25
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 2
-#define DIV_QR_1_UNNORM_THRESHOLD 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 93
-#define MUL_TOOM44_THRESHOLD 139
-#define MUL_TOOM6H_THRESHOLD 165
-#define MUL_TOOM8H_THRESHOLD 278
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 104
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 51
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 67
-
-#define SQR_BASECASE_THRESHOLD 7
-#define SQR_TOOM2_THRESHOLD 71
-#define SQR_TOOM3_THRESHOLD 98
-#define SQR_TOOM4_THRESHOLD 175
-#define SQR_TOOM6_THRESHOLD 190
-#define SQR_TOOM8_THRESHOLD 339
-
-#define MULMID_TOOM42_THRESHOLD 40
-
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 9
-
-#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 220, 5}, { 13, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \
- { 11, 7}, { 24, 8}, { 13, 9}, { 7, 8}, \
- { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \
- { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
- { 23, 8}, { 47, 9}, { 27,10}, { 15, 9}, \
- { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \
- { 31, 9}, { 63, 8}, { 127, 7}, { 255, 9}, \
- { 67,10}, { 39, 9}, { 79, 8}, { 159, 9}, \
- { 83,10}, { 47, 9}, { 95, 8}, { 191, 7}, \
- { 383,10}, { 55,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255, 7}, { 511,10}, { 71, 9}, \
- { 143, 8}, { 287, 7}, { 575,10}, { 79, 9}, \
- { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \
- { 191, 8}, { 383,10}, { 103, 9}, { 207, 8}, \
- { 415,10}, { 111,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \
- { 575,11}, { 79,10}, { 175, 9}, { 351,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \
- { 415,11}, { 111,10}, { 223,12}, { 63,11}, \
- { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \
- { 575, 8}, { 1151,11}, { 159,10}, { 319, 9}, \
- { 639,11}, { 175,10}, { 351, 9}, { 703,12}, \
- { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
- { 415,11}, { 223,10}, { 447,13}, { 63,12}, \
- { 127,11}, { 287,10}, { 575,12}, { 159,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 415,12}, \
- { 223,11}, { 479,13}, { 127,12}, { 287,11}, \
- { 575,12}, { 351,13}, { 191,12}, { 415,11}, \
- { 831,12}, { 479,14}, { 127,13}, { 255,12}, \
- { 575,13}, { 319,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 831,13}, { 447,12}, { 895,14}, \
- { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
- { 1215,13}, { 703,14}, { 383,13}, { 831,12}, \
- { 1663,13}, { 895,15}, { 255,14}, { 511,13}, \
- { 1151,14}, { 639,13}, { 1407,14}, { 767,13}, \
- { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \
- { 1023,13}, { 2047,14}, { 1151,13}, { 2303,14}, \
- { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \
- { 1023,14}, { 2303,15}, { 1279,14}, { 2815,15}, \
- { 1535,14}, { 3199,15}, { 1791,14}, { 3583,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 172
-#define MUL_FFT_THRESHOLD 2240
-
-#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 244, 5}, { 8, 4}, { 17, 5}, { 15, 6}, \
- { 8, 5}, { 17, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 10, 6}, { 21, 7}, { 17, 8}, \
- { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \
- { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
- { 31, 9}, { 19, 8}, { 39, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
- { 71, 9}, { 143, 8}, { 287, 7}, { 575,10}, \
- { 79, 9}, { 159, 8}, { 319,11}, { 47, 9}, \
- { 191, 8}, { 383, 7}, { 767, 9}, { 207,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
- { 575,11}, { 79,10}, { 159, 9}, { 319, 8}, \
- { 639,10}, { 175, 9}, { 351, 8}, { 703,10}, \
- { 191, 9}, { 383, 8}, { 767,10}, { 207, 9}, \
- { 415, 8}, { 831,10}, { 223, 9}, { 447, 8}, \
- { 895,12}, { 63,11}, { 127,10}, { 271,11}, \
- { 143,10}, { 287, 9}, { 575, 8}, { 1215,11}, \
- { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 383,11}, \
- { 207,10}, { 415, 9}, { 831, 8}, { 1663,10}, \
- { 447,13}, { 63,12}, { 127,11}, { 271,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 575, 9}, \
- { 1151,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 415,10}, \
- { 831,12}, { 223,11}, { 447,13}, { 127,12}, \
- { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
- { 319,11}, { 639,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 415,11}, { 831,12}, { 479,14}, \
- { 127,13}, { 255,12}, { 607,13}, { 319,12}, \
- { 703,11}, { 1407,13}, { 383,12}, { 831,13}, \
- { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \
- { 1279,13}, { 703,14}, { 383,13}, { 831,12}, \
- { 1663,13}, { 895,15}, { 255,14}, { 511,13}, \
- { 1151,14}, { 639,13}, { 1407,14}, { 767,13}, \
- { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \
- { 1023,13}, { 2047,14}, { 1151,13}, { 2303,14}, \
- { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \
- { 1023,14}, { 2303,15}, { 1279,14}, { 2815,15}, \
- { 1535,14}, { 3199,15}, { 1791,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 183
-#define SQR_FFT_THRESHOLD 1728
-
-#define MULLO_BASECASE_THRESHOLD 19
-#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */
-#define MULLO_MUL_N_THRESHOLD 4392
-
-#define DC_DIV_QR_THRESHOLD 15
-#define DC_DIVAPPR_Q_THRESHOLD 64
-#define DC_BDIV_QR_THRESHOLD 29
-#define DC_BDIV_Q_THRESHOLD 86
-
-#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 17
-#define INV_APPR_THRESHOLD 17
-
-#define BINV_NEWTON_THRESHOLD 111
-#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */
-#define REDC_2_TO_REDC_N_THRESHOLD 115
-
-#define MU_DIV_QR_THRESHOLD 680
-#define MU_DIVAPPR_Q_THRESHOLD 618
-#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 680
-#define MU_BDIV_Q_THRESHOLD 807
-
-#define POWM_SEC_TABLE 1,16,102,386,1985,2079
-
-#define MATRIX22_STRASSEN_THRESHOLD 12
-#define HGCD_THRESHOLD 46
-#define HGCD_APPR_THRESHOLD 50
-#define HGCD_REDUCE_THRESHOLD 1012
-#define GCD_DC_THRESHOLD 124
-#define GCDEXT_DC_THRESHOLD 138
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 20
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 324
-#define SET_STR_PRECOMPUTE_THRESHOLD 1043
-
-#define FAC_DSC_THRESHOLD 422
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/sparc64/ultrasparct1/add_n.asm b/gmp/mpn/sparc64/ultrasparct1/add_n.asm
deleted file mode 100644
index 954c7f6d35..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/add_n.asm
+++ /dev/null
@@ -1,68 +0,0 @@
-dnl SPARC v9 mpn_add_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: ?
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n', `%o3')
-define(`cy', `%o4')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_add_nc)
- b,a L(ent)
-EPILOGUE()
-PROLOGUE(mpn_add_n)
- mov 0, cy
-L(ent): cmp %g0, cy
-L(top): ldx [up+0], %o4
- add up, 8, up
- ldx [vp+0], %o5
- add vp, 8, vp
- add rp, 8, rp
- add n, -1, n
- srlx %o4, 32, %g1
- srlx %o5, 32, %g2
- addccc %o4, %o5, %g3
- addccc %g1, %g2, %g0
- brgz n, L(top)
- stx %g3, [rp-8]
-
- retl
- addc %g0, %g0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct1/addlsh1_n.asm b/gmp/mpn/sparc64/ultrasparct1/addlsh1_n.asm
deleted file mode 100644
index 313479773f..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/addlsh1_n.asm
+++ /dev/null
@@ -1,41 +0,0 @@
-dnl SPARC v9 mpn_addlsh1_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 1)
-define(RSH, 63)
-
-define(func, mpn_addlsh1_n)
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n)
-
-include_mpn(`sparc64/ultrasparct1/addlshC_n.asm')
diff --git a/gmp/mpn/sparc64/ultrasparct1/addlsh2_n.asm b/gmp/mpn/sparc64/ultrasparct1/addlsh2_n.asm
deleted file mode 100644
index ee1afd0116..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/addlsh2_n.asm
+++ /dev/null
@@ -1,41 +0,0 @@
-dnl SPARC v9 mpn_addlsh2_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 2)
-define(RSH, 62)
-
-define(func, mpn_addlsh2_n)
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n)
-
-include_mpn(`sparc64/ultrasparct1/addlshC_n.asm')
diff --git a/gmp/mpn/sparc64/ultrasparct1/addlshC_n.asm b/gmp/mpn/sparc64/ultrasparct1/addlshC_n.asm
deleted file mode 100644
index 5be9a0d30a..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/addlshC_n.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-dnl SPARC v9 mpn_addlshC_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-C cycles/limb
-C UltraSPARC T1: 21
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n', `%o3')
-define(`cy', `%o4')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(func)
- mov 0, cy
- mov 0, %g5
- cmp %g0, cy
-L(top): ldx [up+0], %o4
- add up, 8, up
- ldx [vp+0], %o5
- add vp, 8, vp
- add rp, 8, rp
-
- sllx %o5, LSH, %g4
- add n, -1, n
- or %g5, %g4, %g4
- srlx %o5, RSH, %g5
-
- srlx %o4, 32, %g1
- srlx %g4, 32, %g2
- addccc %o4, %g4, %g3
- addccc %g1, %g2, %g0
- brgz n, L(top)
- stx %g3, [rp-8]
-
- retl
- addc %g5, %g0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct1/addmul_1.asm b/gmp/mpn/sparc64/ultrasparct1/addmul_1.asm
deleted file mode 100644
index 29dba966f3..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/addmul_1.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-dnl SPARC v9 mpn_addmul_1 for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: 74
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_addmul_1)
- save %sp, -176, %sp
- mov 1, %o2
- mov %i0, %g2
- srlx %i3, 32, %o4
- sllx %o2, 32, %o2
- srl %i3, 0, %i3
- mov 0, %g3
- mov 0, %i0
-
-L(top): ldx [%i1+%g3], %g1
- srl %g1, 0, %g4
- mulx %g4, %i3, %o5
- srlx %g1, 32, %g1
- mulx %g1, %i3, %g5
- mulx %g4, %o4, %g4
- mulx %g1, %o4, %g1
- srlx %o5, 32, %o1
- add %g5, %o1, %o1
- addcc %o1, %g4, %g4
- srl %o5, 0, %o0
- ldx [%g2+%g3], %o5
- sllx %g4, 32, %o1
- add %g1, %o2, %l1
- movlu %xcc, %l1, %g1
- add %o1, %o0, %l0
- addcc %l0, %i0, %g5
- srlx %g4, 32, %i0
- add %i0, 1, %g4
- movlu %xcc, %g4, %i0
- addcc %o5, %g5, %g5
- stx %g5, [%g2+%g3]
- add %i0, 1, %g4
- movlu %xcc, %g4, %i0
- add %i2, -1, %i2
- add %i0, %g1, %i0
- brnz,pt %i2, L(top)
- add %g3, 8, %g3
- return %i7+8
- nop
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct1/gmp-mparam.h b/gmp/mpn/sparc64/ultrasparct1/gmp-mparam.h
deleted file mode 100644
index 99db78ac0f..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/gmp-mparam.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 1000 MHz ultrasparc t1 running GNU/Linux */
-
-#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1_1P_METHOD 2
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 13
-#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 34
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 8
-#define MUL_TOOM33_THRESHOLD 50
-#define MUL_TOOM44_THRESHOLD 99
-#define MUL_TOOM6H_THRESHOLD 125
-#define MUL_TOOM8H_THRESHOLD 187
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 50
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 14
-#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 133
-#define SQR_TOOM6_THRESHOLD 156
-#define SQR_TOOM8_THRESHOLD 260
-
-#define MULMID_TOOM42_THRESHOLD 12
-
-#define MULMOD_BNM1_THRESHOLD 7
-#define SQRMOD_BNM1_THRESHOLD 7
-
-#define MUL_FFT_MODF_THRESHOLD 176 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 176, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \
- { 5, 5}, { 11, 6}, { 11, 7}, { 6, 6}, \
- { 13, 7}, { 7, 6}, { 15, 7}, { 9, 8}, \
- { 5, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \
- { 32, 7}, { 24, 8}, { 21, 9}, { 11, 8}, \
- { 23,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
- { 19, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
- { 43,10}, { 23,11}, { 15,10}, { 31, 9}, \
- { 63, 8}, { 127, 9}, { 67,10}, { 39, 9}, \
- { 79, 8}, { 159,10}, { 47, 9}, { 95,11}, \
- { 2048,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 53
-#define MUL_FFT_THRESHOLD 1728
-
-
-#define SQR_FFT_MODF_THRESHOLD 148 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 148, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \
- { 5, 5}, { 11, 6}, { 11, 7}, { 6, 6}, \
- { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \
- { 7, 7}, { 16, 8}, { 9, 6}, { 38, 7}, \
- { 20, 8}, { 11, 7}, { 24, 8}, { 13, 9}, \
- { 7, 7}, { 30, 8}, { 19, 9}, { 11, 8}, \
- { 25,10}, { 7, 9}, { 15, 8}, { 31, 9}, \
- { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \
- { 39,10}, { 23, 9}, { 47, 8}, { 95, 9}, \
- { 51,11}, { 15,10}, { 31, 8}, { 127,10}, \
- { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \
- { 95,11}, { 2048,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 58
-#define SQR_FFT_THRESHOLD 1344
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 28
-#define MULLO_MUL_N_THRESHOLD 3176
-
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 106
-#define DC_BDIV_QR_THRESHOLD 27
-#define DC_BDIV_Q_THRESHOLD 62
-
-#define INV_MULMOD_BNM1_THRESHOLD 14
-#define INV_NEWTON_THRESHOLD 163
-#define INV_APPR_THRESHOLD 117
-
-#define BINV_NEWTON_THRESHOLD 166
-#define REDC_1_TO_REDC_N_THRESHOLD 31
-
-#define MU_DIV_QR_THRESHOLD 734
-#define MU_DIVAPPR_Q_THRESHOLD 748
-#define MUPI_DIV_QR_THRESHOLD 67
-#define MU_BDIV_QR_THRESHOLD 562
-#define MU_BDIV_Q_THRESHOLD 734
-
-#define POWM_SEC_TABLE 4,29,188,643,2741
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 58
-#define HGCD_APPR_THRESHOLD 55
-#define HGCD_REDUCE_THRESHOLD 637
-#define GCD_DC_THRESHOLD 186
-#define GCDEXT_DC_THRESHOLD 140
-#define JACOBI_BASE_METHOD 3
-
-#define GET_STR_DC_THRESHOLD 20
-#define GET_STR_PRECOMPUTE_THRESHOLD 33
-#define SET_STR_DC_THRESHOLD 268
-#define SET_STR_PRECOMPUTE_THRESHOLD 960
-
-#define FAC_DSC_THRESHOLD 268
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/sparc64/ultrasparct1/mul_1.asm b/gmp/mpn/sparc64/ultrasparct1/mul_1.asm
deleted file mode 100644
index 1fea2a19ef..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/mul_1.asm
+++ /dev/null
@@ -1,82 +0,0 @@
-dnl SPARC v9 mpn_mul_1 for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: 68
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_mul_1)
- save %sp, -176, %sp
- mov 1, %o2
- mov %i0, %g2
- srlx %i3, 32, %o4
- sllx %o2, 32, %o2
- srl %i3, 0, %i3
- mov 0, %g3
- mov 0, %i0
-
-L(top): ldx [%i1+%g3], %g1
- srl %g1, 0, %g4
- mulx %g4, %i3, %o5
- srlx %g1, 32, %g1
- mulx %g1, %i3, %g5
- mulx %g4, %o4, %g4
- mulx %g1, %o4, %g1
- srlx %o5, 32, %o1
- add %g5, %o1, %o1
- addcc %o1, %g4, %g4
- srl %o5, 0, %o0
- sllx %g4, 32, %o1
- add %g1, %o2, %l1
- movlu %xcc, %l1, %g1
- add %o1, %o0, %l0
- addcc %l0, %i0, %g5
- srlx %g4, 32, %i0
- add %i0, 1, %g4
- movlu %xcc, %g4, %i0
- stx %g5, [%g2+%g3]
- add %i2, -1, %i2
- add %i0, %g1, %i0
- brnz,pt %i2, L(top)
- add %g3, 8, %g3
- return %i7+8
- nop
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct1/rsblsh1_n.asm b/gmp/mpn/sparc64/ultrasparct1/rsblsh1_n.asm
deleted file mode 100644
index 51bd4ab45b..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/rsblsh1_n.asm
+++ /dev/null
@@ -1,41 +0,0 @@
-dnl SPARC v9 mpn_rsblsh1_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 1)
-define(RSH, 63)
-
-define(func, mpn_rsblsh1_n)
-
-MULFUNC_PROLOGUE(mpn_rsblsh1_n)
-
-include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm')
diff --git a/gmp/mpn/sparc64/ultrasparct1/rsblsh2_n.asm b/gmp/mpn/sparc64/ultrasparct1/rsblsh2_n.asm
deleted file mode 100644
index f0d208e198..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/rsblsh2_n.asm
+++ /dev/null
@@ -1,41 +0,0 @@
-dnl SPARC v9 mpn_rsblsh2_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 2)
-define(RSH, 62)
-
-define(func, mpn_rsblsh2_n)
-
-MULFUNC_PROLOGUE(mpn_rsblsh2_n)
-
-include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm')
diff --git a/gmp/mpn/sparc64/ultrasparct1/rsblshC_n.asm b/gmp/mpn/sparc64/ultrasparct1/rsblshC_n.asm
deleted file mode 100644
index 7c03e9f97f..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/rsblshC_n.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-dnl SPARC v9 mpn_rsblshC_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-C cycles/limb
-C UltraSPARC T1: 21
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n', `%o3')
-define(`cy', `%o4')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(func)
- mov 0, cy
- mov 0, %g5
- cmp %g0, cy
-L(top): ldx [up+0], %o4
- add up, 8, up
- ldx [vp+0], %o5
- add vp, 8, vp
- add rp, 8, rp
-
- sllx %o5, LSH, %g4
- add n, -1, n
- or %g5, %g4, %g4
- srlx %o5, RSH, %g5
-
- srlx %o4, 32, %g1
- srlx %g4, 32, %g2
- subccc %g4, %o4, %g3
- subccc %g2, %g1, %g0
- brgz n, L(top)
- stx %g3, [rp-8]
-
- retl
- subc %g5, %g0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct1/sub_n.asm b/gmp/mpn/sparc64/ultrasparct1/sub_n.asm
deleted file mode 100644
index c2af89f08f..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/sub_n.asm
+++ /dev/null
@@ -1,68 +0,0 @@
-dnl SPARC v9 mpn_sub_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: ?
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n', `%o3')
-define(`cy', `%o4')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_sub_nc)
- b,a L(ent)
-EPILOGUE()
-PROLOGUE(mpn_sub_n)
- mov 0, cy
-L(ent): cmp %g0, cy
-L(top): ldx [up+0], %o4
- add up, 8, up
- ldx [vp+0], %o5
- add vp, 8, vp
- add rp, 8, rp
- add n, -1, n
- srlx %o4, 32, %g1
- srlx %o5, 32, %g2
- subccc %o4, %o5, %g3
- subccc %g1, %g2, %g0
- brgz n, L(top)
- stx %g3, [rp-8]
-
- retl
- addc %g0, %g0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct1/sublsh1_n.asm b/gmp/mpn/sparc64/ultrasparct1/sublsh1_n.asm
deleted file mode 100644
index 8c8fa80401..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/sublsh1_n.asm
+++ /dev/null
@@ -1,41 +0,0 @@
-dnl SPARC v9 mpn_sublsh1_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 1)
-define(RSH, 63)
-
-define(func, mpn_sublsh1_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh1_n)
-
-include_mpn(`sparc64/ultrasparct1/sublshC_n.asm')
diff --git a/gmp/mpn/sparc64/ultrasparct1/sublsh2_n.asm b/gmp/mpn/sparc64/ultrasparct1/sublsh2_n.asm
deleted file mode 100644
index 2fd5eee71a..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/sublsh2_n.asm
+++ /dev/null
@@ -1,41 +0,0 @@
-dnl SPARC v9 mpn_sublsh2_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-define(LSH, 2)
-define(RSH, 62)
-
-define(func, mpn_sublsh2_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh2_n)
-
-include_mpn(`sparc64/ultrasparct1/sublshC_n.asm')
diff --git a/gmp/mpn/sparc64/ultrasparct1/sublshC_n.asm b/gmp/mpn/sparc64/ultrasparct1/sublshC_n.asm
deleted file mode 100644
index 01eafef1bc..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/sublshC_n.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-dnl SPARC v9 mpn_sublshC_n for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-C cycles/limb
-C UltraSPARC T1: 21
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n', `%o3')
-define(`cy', `%o4')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(func)
- mov 0, cy
- mov 0, %g5
- cmp %g0, cy
-L(top): ldx [up+0], %o4
- add up, 8, up
- ldx [vp+0], %o5
- add vp, 8, vp
- add rp, 8, rp
-
- sllx %o5, LSH, %g4
- add n, -1, n
- or %g5, %g4, %g4
- srlx %o5, RSH, %g5
-
- srlx %o4, 32, %g1
- srlx %g4, 32, %g2
- subccc %o4, %g4, %g3
- subccc %g1, %g2, %g0
- brgz n, L(top)
- stx %g3, [rp-8]
-
- retl
- addc %g5, %g0, %o0
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct1/submul_1.asm b/gmp/mpn/sparc64/ultrasparct1/submul_1.asm
deleted file mode 100644
index 4f553a8063..0000000000
--- a/gmp/mpn/sparc64/ultrasparct1/submul_1.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-dnl SPARC v9 mpn_submul_1 for T1/T2.
-
-dnl Copyright 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: 74
-C UltraSPARC T2: ?
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_submul_1)
- save %sp, -176, %sp
- mov 1, %o2
- mov %i0, %g2
- srlx %i3, 32, %o4
- sllx %o2, 32, %o2
- srl %i3, 0, %i3
- mov 0, %g3
- mov 0, %i0
-
-L(top): ldx [%i1+%g3], %g1
- srl %g1, 0, %g4
- mulx %g4, %i3, %o5
- srlx %g1, 32, %g1
- mulx %g1, %i3, %g5
- mulx %g4, %o4, %g4
- mulx %g1, %o4, %g1
- srlx %o5, 32, %o1
- add %g5, %o1, %o1
- addcc %o1, %g4, %g4
- srl %o5, 0, %o0
- ldx [%g2+%g3], %o5
- sllx %g4, 32, %o1
- add %g1, %o2, %l1
- movlu %xcc, %l1, %g1
- add %o1, %o0, %l0
- addcc %l0, %i0, %g5
- srlx %g4, 32, %i0
- add %i0, 1, %g4
- movlu %xcc, %g4, %i0
- subcc %o5, %g5, %g5
- stx %g5, [%g2+%g3]
- add %i0, 1, %g4
- movlu %xcc, %g4, %i0
- add %i2, -1, %i2
- add %i0, %g1, %i0
- brnz,pt %i2, L(top)
- add %g3, 8, %g3
- return %i7+8
- nop
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/add_n.asm b/gmp/mpn/sparc64/ultrasparct3/add_n.asm
deleted file mode 100644
index 0170746895..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/add_n.asm
+++ /dev/null
@@ -1,126 +0,0 @@
-dnl SPARC v9 mpn_add_n for T3/T4.
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 8
-C UltraSPARC T4: 3
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`vp', `%i2')
-define(`n', `%i3')
-define(`cy', `%i4')
-
-define(`u0_off', `%l2')
-define(`u1_off', `%l3')
-define(`loop_n', `%l6')
-define(`tmp', `%l7')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_add_nc)
- save %sp, -176, %sp
- b,a L(ent)
-EPILOGUE()
-PROLOGUE(mpn_add_n)
- save %sp, -176, %sp
-
- mov 0, cy
-L(ent):
- subcc n, 1, n
- be L(final_one)
- cmp %g0, cy
-
- ldx [up + 0], %o4
- sllx n, 3, tmp
-
- ldx [vp + 0], %o5
- add up, tmp, u0_off
-
- ldx [up + 8], %g5
- neg tmp, loop_n
-
- ldx [vp + 8], %g1
- add u0_off, 8, u1_off
-
- sub loop_n, -(2 * 8), loop_n
-
- brgez,pn loop_n, L(loop_tail)
- add vp, (2 * 8), vp
-
- b,a L(top)
- ALIGN(16)
-L(top):
- addxccc(%o4, %o5, tmp)
- ldx [vp + 0], %o5
-
- add rp, (2 * 8), rp
- ldx [loop_n + u0_off], %o4
-
- add vp, (2 * 8), vp
- stx tmp, [rp - 16]
-
- addxccc(%g1, %g5, tmp)
- ldx [vp - 8], %g1
-
- ldx [loop_n + u1_off], %g5
- sub loop_n, -(2 * 8), loop_n
-
- brlz loop_n, L(top)
- stx tmp, [rp - 8]
-
-L(loop_tail):
- addxccc(%o4, %o5, %g3)
- add loop_n, u0_off, up
-
- addxccc(%g1, %g5, %g5)
- stx %g3, [rp + 0]
-
- brgz,pt loop_n, L(done)
- stx %g5, [rp + 8]
-
- add rp, (2 * 8), rp
-L(final_one):
- ldx [up+0], %o4
- ldx [vp+0], %o5
- addxccc(%o4, %o5, %g3)
- stx %g3, [rp+0]
-
-L(done):
- addxc(%g0, %g0, %i0)
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/addmul_1.asm b/gmp/mpn/sparc64/ultrasparct3/addmul_1.asm
deleted file mode 100644
index 939811e1ce..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/addmul_1.asm
+++ /dev/null
@@ -1,182 +0,0 @@
-dnl SPARC v9 mpn_addmul_1 for T3/T4/T5.
-
-dnl Contributed to the GNU project by David Miller and Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 26
-C UltraSPARC T4: 4.5
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-define(`u0', `%l0')
-define(`u1', `%l1')
-define(`u2', `%l2')
-define(`u3', `%l3')
-define(`r0', `%l4')
-define(`r1', `%l5')
-define(`r2', `%l6')
-define(`r3', `%l7')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_addmul_1)
- save %sp, -176, %sp
- ldx [up+0], %g1
-
- and n, 3, %g3
- brz %g3, L(b0)
- addcc %g0, %g0, %g5 C clear carry limb, flag
- cmp %g3, 2
- bcs %xcc, L(b01)
- nop
- be %xcc, L(b10)
- ldx [up+8], %g5
-
-L(b11): ldx [up+16], u3
- mulx %g1, v0, %o2
- umulxhi(%g1, v0, %o3)
- ldx [rp+0], r1
- mulx %g5, v0, %o4
- ldx [rp+8], r2
- umulxhi(%g5, v0, %o5)
- ldx [rp+16], r3
- mulx u3, v0, %g4
- umulxhi(u3, v0, %g5)
- addcc %o3, %o4, %o4
- addxccc(%o5, %g4, %g4)
- addxc( %g0, %g5, %g5)
- addcc r1, %o2, r1
- stx r1, [rp+0]
- addxccc(r2, %o4, r2)
- stx r2, [rp+8]
- addxccc(r3, %g4, r3)
- stx r3, [rp+16]
- add n, -3, n
- add up, 24, up
- brz n, L(xit)
- add rp, 24, rp
- b L(com)
- nop
-
-L(b10): mulx %g1, v0, %o4
- ldx [rp+0], r2
- umulxhi(%g1, v0, %o5)
- ldx [rp+8], r3
- mulx %g5, v0, %g4
- umulxhi(%g5, v0, %g5)
- addcc %o5, %g4, %g4
- addxc( %g0, %g5, %g5)
- addcc r2, %o4, r2
- stx r2, [rp+0]
- addxccc(r3, %g4, r3)
- stx r3, [rp+8]
- add n, -2, n
- add up, 16, up
- brz n, L(xit)
- add rp, 16, rp
- b L(com)
- nop
-
-L(b01): ldx [rp+0], r3
- mulx %g1, v0, %g4
- umulxhi(%g1, v0, %g5)
- addcc r3, %g4, r3
- stx r3, [rp+0]
- add n, -1, n
- add up, 8, up
- brz n, L(xit)
- add rp, 8, rp
-
-L(com): ldx [up+0], %g1
-L(b0): ldx [up+8], u1
- ldx [up+16], u2
- ldx [up+24], u3
- mulx %g1, v0, %o0
- umulxhi(%g1, v0, %o1)
- b L(lo0)
- nop
-
- ALIGN(16)
-L(top): ldx [up+0], u0
- addxc( %g0, %g5, %g5) C propagate carry into carry limb
- ldx [up+8], u1
- addcc r0, %o0, r0
- ldx [up+16], u2
- addxccc(r1, %o2, r1)
- ldx [up+24], u3
- addxccc(r2, %o4, r2)
- stx r0, [rp-32]
- addxccc(r3, %g4, r3)
- stx r1, [rp-24]
- mulx u0, v0, %o0
- stx r2, [rp-16]
- umulxhi(u0, v0, %o1)
- stx r3, [rp-8]
-L(lo0): mulx u1, v0, %o2
- ldx [rp+0], r0
- umulxhi(u1, v0, %o3)
- ldx [rp+8], r1
- mulx u2, v0, %o4
- ldx [rp+16], r2
- umulxhi(u2, v0, %o5)
- ldx [rp+24], r3
- mulx u3, v0, %g4
- addxccc(%g5, %o0, %o0)
- umulxhi(u3, v0, %g5)
- add up, 32, up
- addxccc(%o1, %o2, %o2)
- add rp, 32, rp
- addxccc(%o3, %o4, %o4)
- add n, -4, n
- addxccc(%o5, %g4, %g4)
- brgz n, L(top)
- nop
-
- addxc( %g0, %g5, %g5)
- addcc r0, %o0, r0
- stx r0, [rp-32]
- addxccc(r1, %o2, r1)
- stx r1, [rp-24]
- addxccc(r2, %o4, r2)
- stx r2, [rp-16]
- addxccc(r3, %g4, r3)
- stx r3, [rp-8]
-L(xit): addxc( %g0, %g5, %i0)
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/aormul_2.asm b/gmp/mpn/sparc64/ultrasparct3/aormul_2.asm
deleted file mode 100644
index ccc6a4408d..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/aormul_2.asm
+++ /dev/null
@@ -1,228 +0,0 @@
-dnl SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb
-C mul_2 addmul_2
-C UltraSPARC T3: 22.5 23.5
-C UltraSPARC T4: 3.25 3.75
-
-
-C The code is reasonably scheduled but also relies on OoO. There was hope that
-C this could run at around 3.0 and 3.5 c/l respectively, on T4. Two cycles per
-C iteration needs to be removed.
-C
-C We could almost use 2-way unrolling, but currently the wN registers live too
-C long. By changing add x,w1,w1 to add x,w1,w0, i.e. migrate the values down-
-C wards, 2-way unrolling should become possible. With n-indexed addressing it
-C should run no slower.
-C
-C The rp loads to g1/g3 are very much over-scheduled. Presumably, they could
-C be postponed a full way, and then just one register could be used.
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`vp', `%i3')
-
-define(`v0', `%o0')
-define(`v1', `%o1')
-
-define(`w0', `%o2')
-define(`w1', `%o3')
-define(`w2', `%o4')
-define(`w3', `%o5')
-
-ifdef(`OPERATION_mul_2',`
- define(`AM2', `')
- define(`ADDX', `addcc`'$1')
- define(`func', `mpn_mul_2')
-')
-ifdef(`OPERATION_addmul_2',`
- define(`AM2', `$1')
- define(`ADDX', `addxccc($1,$2,$3)')
- define(`func', `mpn_addmul_2')
-')
-
-
-MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(func)
- save %sp, -176, %sp
-
- ldx [vp+0], v0 C load v0
- and n, 3, %g5
- ldx [vp+8], v1 C load v1
- add n, -6, n
- ldx [up+0], %g4
- brz %g5, L(b0)
- cmp %g5, 2
- bcs L(b1)
- nop
- be L(b2)
- nop
-
-L(b3):
-AM2(` ldx [rp+0], %g1')
- mulx %g4, v0, w2
- umulxhi(%g4, v0, w3)
- ldx [up+8], %i5
- mulx %g4, v1, %l3
- umulxhi(%g4, v1, %l7)
-AM2(` ldx [rp+8], %g3')
- add up, -8, up
- add rp, -8, rp
- b L(lo3)
- mov 0, w0
-
-L(b2):
-AM2(` ldx [rp+0], %g3')
- mulx %g4, v0, w3
- umulxhi(%g4, v0, w0)
- ldx [up+8], %i4
- mulx %g4, v1, %l1
- umulxhi(%g4, v1, %l5)
-AM2(` ldx [rp+8], %g1')
- add rp, 16, rp
- brlz n, L(end)
- mov 0, w1
- ba L(top)
- add up, 16, up
-
-L(b1):
-AM2(` ldx [rp+0], %g1')
- mulx %g4, v0, w0
- umulxhi(%g4, v0, w1)
- ldx [up+8], %i5
- mulx %g4, v1, %l3
- umulxhi(%g4, v1, %l7)
-AM2(` ldx [rp+8], %g3')
- add up, 8, up
- add rp, 8, rp
- b L(lo1)
- mov 0, w2
-
-L(b0):
-AM2(` ldx [rp+0], %g3')
- mulx %g4, v0, w1
- umulxhi(%g4, v0, w2)
- ldx [up+8], %i4
- mulx %g4, v1, %l1
- umulxhi(%g4, v1, %l5)
-AM2(` ldx [rp+8], %g1')
- b L(lo0)
- mov 0, w3
-
- ALIGN(16) C cycle
-L(top): mulx %i4, v0, %l2 C 0->5
- umulxhi(%i4, v0, %l6) C 0->5
- ldx [up+0], %i5 C 1->6
-AM2(` addcc w3, %g3, w3') C 1
- stx w3, [rp-16] C 2
- ADDX(` %l1, w0, w0') C 2
- addxccc(%l5, w1, w1) C 3
- mulx %i4, v1, %l3 C 3->9
- umulxhi(%i4, v1, %l7) C 4->9
-AM2(` ldx [rp+0], %g3') C 4
- addcc %l2, w0, w0 C 5
- addxccc(%l6, w1, w1) C 5
- addxc( %g0, %g0, w2) C 6
-L(lo1): mulx %i5, v0, %l0 C 6
- umulxhi(%i5, v0, %l4) C 7
- ldx [up+8], %i4 C 7
-AM2(` addcc w0, %g1, w0') C 8
- stx w0, [rp-8] C 8
- ADDX(` %l3, w1, w1') C 9
- addxccc(%l7, w2, w2) C 9
- mulx %i5, v1, %l1 C 10
- umulxhi(%i5, v1, %l5) C 10
-AM2(` ldx [rp+8], %g1') C 11
- addcc %l0, w1, w1 C 11
- addxccc(%l4, w2, w2) C 12
- addxc( %g0, %g0, w3) C 12
-L(lo0): mulx %i4, v0, %l2 C 13
- umulxhi(%i4, v0, %l6) C 13
- ldx [up+16], %i5 C 14
-AM2(` addcc w1, %g3, w1') C 14
- stx w1, [rp+0] C 15
- ADDX(` %l1, w2, w2') C 15
- addxccc(%l5, w3, w3) C 16
- mulx %i4, v1, %l3 C 16
- umulxhi(%i4, v1, %l7) C 17
-AM2(` ldx [rp+16], %g3') C 17
- addcc %l2, w2, w2 C 18
- addxccc(%l6, w3, w3) C 18
- addxc( %g0, %g0, w0) C 19
-L(lo3): mulx %i5, v0, %l0 C 19
- umulxhi(%i5, v0, %l4) C 20
- ldx [up+24], %i4 C 20
-AM2(` addcc w2, %g1, w2') C 21
- stx w2, [rp+8] C 21
- ADDX(` %l3, w3, w3') C 22
- addxccc(%l7, w0, w0) C 22
- mulx %i5, v1, %l1 C 23
- umulxhi(%i5, v1, %l5) C 23
-AM2(` ldx [rp+24], %g1') C 24
- addcc %l0, w3, w3 C 24
- addxccc(%l4, w0, w0) C 25
- addxc( %g0, %g0, w1) C 25
- add up, 32, up
- add rp, 32, rp
- brgz n, L(top)
- add n, -4, n
-
-L(end): mulx %i4, v0, %l2
- umulxhi(%i4, v0, %l6)
-AM2(` addcc w3, %g3, w3')
- stx w3, [rp-16]
- ADDX(` %l1, w0, w0')
- addxccc(%l5, w1, w1)
- mulx %i4, v1, %l3
- umulxhi(%i4, v1, %l7)
- addcc %l2, w0, w0
- addxccc(%l6, w1, w1)
- addxc( %g0, %g0, w2)
-AM2(` addcc w0, %g1, w0')
- stx w0, [rp-8]
- ADDX(` %l3, w1, w1')
- stx w1, [rp+0]
- addxc(%l7, w2, %i0)
-
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/aormul_4.asm b/gmp/mpn/sparc64/ultrasparct3/aormul_4.asm
deleted file mode 100644
index 845f6d6d69..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/aormul_4.asm
+++ /dev/null
@@ -1,219 +0,0 @@
-dnl SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb
-C mul_4 addmul_4
-C UltraSPARC T3: 21.5 22.0
-C UltraSPARC T4: 2.625 2.75
-
-
-C The code is well-scheduled and relies on OoO very little. There is hope that
-C this will run at around 2.5 and 2.75 c/l respectively, on T4.
-
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`vp', `%i3')
-
-define(`v0', `%g1')
-define(`v1', `%o7')
-define(`v2', `%g2')
-define(`v3', `%i3')
-
-define(`w0', `%o0')
-define(`w1', `%o1')
-define(`w2', `%o2')
-define(`w3', `%o3')
-define(`w4', `%o4')
-
-define(`r0', `%o5')
-
-define(`u0', `%i4')
-define(`u1', `%i5')
-
-define(`rp0', `rp')
-define(`rp1', `%g3')
-define(`rp2', `%g4')
-define(`up0', `up')
-define(`up1', `%g5')
-
-ifdef(`OPERATION_mul_4',`
- define(`AM4', `')
- define(`ADDX', `addcc`'$1')
- define(`func', `mpn_mul_4')
-')
-ifdef(`OPERATION_addmul_4',`
- define(`AM4', `$1')
- define(`ADDX', `addxccc($1,$2,$3)')
- define(`func', `mpn_addmul_4')
-')
-
-
-MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(func)
- save %sp, -176, %sp
-
- ldx [up + 0], u1 C load up[0] early
- andcc n, 1, %g0 C is n odd?
- ldx [vp + 0], v0
- sllx n, 3, n
- ldx [vp + 8], v1
- add n, -28, n
- ldx [vp + 16], v2
- add rp, -16, rp
- ldx [vp + 24], v3
- add up, n, up0
- add rp, n, rp0
- add up0, 8, up1
- add rp0, 8, rp1
- add rp0, 16, rp2
- mulx u1, v0, %l0
- mov 0, w0
- mulx u1, v1, %l1
- mov 0, w1
- mulx u1, v2, %l2
- mov 0, w2
- mulx u1, v3, %l3
- mov 0, w3
-
- be L(evn)
- neg n, n
-
-L(odd): mov u1, u0
- ldx [up1 + n], u1
-AM4(` ldx [rp2 + n], r0')
- umulxhi(u0, v0, %l4)
- umulxhi(u0, v1, %l5)
- umulxhi(u0, v2, %l6)
- umulxhi(u0, v3, %l7)
- b L(mid)
- add n, 8, n
-
-L(evn): ldx [up1 + n], u0
-AM4(` ldx [rp2 + n], r0')
- umulxhi(u1, v0, %l4)
- umulxhi(u1, v1, %l5)
- umulxhi(u1, v2, %l6)
- umulxhi(u1, v3, %l7)
- add n, 16, n
-
- ALIGN(16)
-L(top): addcc %l0, w0, w0
- mulx u0, v0, %l0 C w 0
- addxccc(%l1, w1, w1)
- mulx u0, v1, %l1 C w 1
- addxccc(%l2, w2, w2)
- mulx u0, v2, %l2 C w 2
- addxccc(%l3, w3, w3)
- mulx u0, v3, %l3 C w 3
- ldx [up0 + n], u1
- addxc( %g0, %g0, w4)
-AM4(` addcc r0, w0, w0')
- stx w0, [rp0 + n]
- ADDX(` %l4, w1, w0')
- umulxhi(u0, v0, %l4) C w 1
-AM4(` ldx [rp1 + n], r0')
- addxccc(%l5, w2, w1)
- umulxhi(u0, v1, %l5) C w 2
- addxccc(%l6, w3, w2)
- umulxhi(u0, v2, %l6) C w 3
- addxc( %l7, w4, w3)
- umulxhi(u0, v3, %l7) C w 4
-L(mid): addcc %l0, w0, w0
- mulx u1, v0, %l0 C w 1
- addxccc(%l1, w1, w1)
- mulx u1, v1, %l1 C w 2
- addxccc(%l2, w2, w2)
- mulx u1, v2, %l2 C w 3
- addxccc(%l3, w3, w3)
- mulx u1, v3, %l3 C w 4
- ldx [up1 + n], u0
- addxc( %g0, %g0, w4)
-AM4(` addcc r0, w0, w0')
- stx w0, [rp1 + n]
- ADDX(` %l4, w1, w0')
- umulxhi(u1, v0, %l4) C w 2
-AM4(` ldx [rp2 + n], r0')
- addxccc(%l5, w2, w1)
- umulxhi(u1, v1, %l5) C w 3
- addxccc(%l6, w3, w2)
- umulxhi(u1, v2, %l6) C w 4
- addxc( %l7, w4, w3)
- umulxhi(u1, v3, %l7) C w 5
- brlz n, L(top)
- add n, 16, n
-
-L(end): addcc %l0, w0, w0
- mulx u0, v0, %l0
- addxccc(%l1, w1, w1)
- mulx u0, v1, %l1
- addxccc(%l2, w2, w2)
- mulx u0, v2, %l2
- addxccc(%l3, w3, w3)
- mulx u0, v3, %l3
- addxc( %g0, %g0, w4)
-AM4(` addcc r0, w0, w0')
- stx w0, [rp0 + n]
- ADDX(` %l4, w1, w0')
- umulxhi(u0, v0, %l4)
-AM4(` ldx [rp1 + n], r0')
- addxccc(%l5, w2, w1)
- umulxhi(u0, v1, %l5)
- addxccc(%l6, w3, w2)
- umulxhi(u0, v2, %l6)
- addxc( %l7, w4, w3)
- umulxhi(u0, v3, %l7)
- addcc %l0, w0, w0
- addxccc(%l1, w1, w1)
- addxccc(%l2, w2, w2)
- addxccc(%l3, w3, w3)
- addxc( %g0, %g0, w4)
-AM4(` addcc r0, w0, w0')
- stx w0, [rp1 + n]
- ADDX(` %l4, w1, w0')
- addxccc(%l5, w2, w1)
- addxccc(%l6, w3, w2)
- stx w0, [rp2 + n]
- add n, 16, n
- stx w1, [rp1 + n]
- stx w2, [rp2 + n]
- addxc( %l7, w4, %i0)
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/aorslsh_n.asm b/gmp/mpn/sparc64/ultrasparct3/aorslsh_n.asm
deleted file mode 100644
index 1014b1ba23..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/aorslsh_n.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-dnl SPARC v9 mpn_addlsh_n and mpn_sublsh_n for T3/T4/T5.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 11
-C UltraSPARC T4: 4
-
-C For sublsh_n we combine the two shifted limbs using xnor, using the identity
-C (a xor not b) = (not (a xor b)) which equals (not (a or b)) when (a and b) =
-C 0 as it is in our usage. This gives us the ones complement for free.
-C Unfortunately, the same trick will not work for rsblsh_n, which will instead
-C require a separate negation.
-C
-C FIXME: Add rsblsh_n to this file.
-
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`vp', `%i2')
-define(`n', `%i3')
-define(`cnt',`%i4')
-
-define(`tnc',`%o5')
-
-ifdef(`OPERATION_addlsh_n',`
- define(`INITCY', `subcc %g0, 0, %g0')
- define(`MERGE', `or')
- define(`func', `mpn_addlsh_n')
-')
-ifdef(`OPERATION_sublsh_n',`
- define(`INITCY', `subcc %g0, 1, %g0')
- define(`MERGE', `xnor')
- define(`func', `mpn_sublsh_n')
-')
-
-define(`rp0', `rp')
-define(`rp1', `%o2')
-define(`up0', `up')
-define(`up1', `%o3')
-define(`vp0', `vp')
-define(`vp1', `%o4')
-
-MULFUNC_PROLOGUE(mpn_addlsh_n mpn_sublsh_n)
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(func)
- save %sp, -176, %sp
- mov 64, tnc
- sub tnc, cnt, tnc
-
- andcc n, 1, %g0
- sllx n, 3, n
- add n, -16, n
- add up, n, up0
- add vp, n, vp0
- add rp, n, rp0
- add up0, 8, up1
- add vp0, 8, vp1
- add rp0, -8, rp1
- add rp0, -16, rp0
- neg n, n
- be L(evn)
- INITCY
-
-L(odd): ldx [vp0 + n], %l1
- mov 0, %l2
- ldx [up0 + n], %l5
- sllx %l1, cnt, %g3
- brgez n, L(wd1)
- add n, 8, n
- ldx [vp0 + n], %l0
- b L(lo1)
- sllx %l1, cnt, %g3
-
-L(evn): ldx [vp0 + n], %l0
- mov 0, %l3
- ldx [up0 + n], %l4
- ldx [vp1 + n], %l1
- b L(lo0)
- sllx %l0, cnt, %g1
-
-L(top): addxccc(%l6, %l4, %o0)
- ldx [vp0 + n], %l0
- sllx %l1, cnt, %g3
- stx %o0, [rp0 + n]
-L(lo1): srlx %l1, tnc, %l3
- MERGE %l2, %g3, %l7
- ldx [up0 + n], %l4
- addxccc(%l7, %l5, %o1)
- ldx [vp1 + n], %l1
- sllx %l0, cnt, %g1
- stx %o1, [rp1 + n]
-L(lo0): srlx %l0, tnc, %l2
- MERGE %l3, %g1, %l6
- ldx [up1 + n], %l5
- brlz,pt n, L(top)
- add n, 16, n
-
- addxccc(%l6, %l4, %o0)
- sllx %l1, cnt, %g3
- stx %o0, [rp0 + n]
-L(wd1): srlx %l1, tnc, %l3
- MERGE %l2, %g3, %l7
- addxccc(%l7, %l5, %o1)
- stx %o1, [rp1 + n]
-
-ifdef(`OPERATION_addlsh_n',
-` addxc( %l3, %g0, %i0)')
-ifdef(`OPERATION_sublsh_n',
-` addxc( %g0, %g0, %g1)
- add %g1, -1, %g1
- sub %l3, %g1, %i0')
-
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm b/gmp/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm
deleted file mode 100644
index 550860d368..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-dnl SPARC T3/T4/T5 mpn_bdiv_dbm1c.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 25
-C UltraSPARC T4/T5: 4
-
-C INPUT PARAMETERS
-define(`qp', `%i0')
-define(`ap', `%i1')
-define(`n', `%i2')
-define(`bd', `%i3')
-define(`h', `%i4')
-
-define(`plo0',`%g4') define(`plo1',`%g5')
-define(`phi0',`%l0') define(`phi1',`%l1')
-define(`a0', `%g1') define(`a1', `%g3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_bdiv_dbm1c)
- save %sp, -176, %sp
-
- and n, 3, %g5
- ldx [ap + 0], %g2
- add n, -5, n
- brz %g5, L(b0)
- cmp %g5, 2
- bcs %xcc, L(b1)
- nop
- be %xcc, L(b2)
- nop
-
-L(b3): ldx [ap + 8], a0
- mulx bd, %g2, plo1
- umulxhi(bd, %g2, phi1)
- ldx [ap + 16], a1
- add qp, -24, qp
- b L(lo3)
- add ap, -8, ap
-
-L(b2): ldx [ap + 8], a1
- mulx bd, %g2, plo0
- umulxhi(bd, %g2, phi0)
- brlz,pt n, L(wd2)
- nop
-L(gt2): ldx [ap + 16], a0
- add ap, 16, ap
- b L(lo2)
- add n, -1, n
-
-L(b1): mulx bd, %g2, plo1
- umulxhi(bd, %g2, phi1)
- brlz,pn n, L(wd1)
- add qp, -8, qp
-L(gt1): ldx [ap + 8], a0
- ldx [ap + 16], a1
- b L(lo1)
- add ap, 8, ap
-
-L(b0): ldx [ap + 8], a1
- mulx bd, %g2, plo0
- umulxhi(bd, %g2, phi0)
- ldx [ap + 16], a0
- b L(lo0)
- add qp, -16, qp
-
-L(top): ldx [ap + 0], a0
- sub h, phi1, h
-L(lo2): mulx bd, a1, plo1
- umulxhi(bd, a1, phi1)
- subcc h, plo0, h
- addxc( phi0, %g0, phi0)
- stx h, [qp + 0]
- ldx [ap + 8], a1
- sub h, phi0, h
-L(lo1): mulx bd, a0, plo0
- umulxhi(bd, a0, phi0)
- subcc h, plo1, h
- addxc( phi1, %g0, phi1)
- stx h, [qp + 8]
- ldx [ap + 16], a0
- sub h, phi1, h
-L(lo0): mulx bd, a1, plo1
- umulxhi(bd, a1, phi1)
- subcc h, plo0, h
- addxc( phi0, %g0, phi0)
- stx h, [qp + 16]
- ldx [ap + 24], a1
- sub h, phi0, h
-L(lo3): mulx bd, a0, plo0
- umulxhi(bd, a0, phi0)
- subcc h, plo1, h
- addxc( phi1, %g0, phi1)
- stx h, [qp + 24]
- add ap, 32, ap
- add qp, 32, qp
- brgz,pt n, L(top)
- add n, -4, n
-
-L(end): sub h, phi1, h
-L(wd2): mulx bd, a1, plo1
- umulxhi(bd, a1, phi1)
- subcc h, plo0, h
- addxc( phi0, %g0, phi0)
- stx h, [qp + 0]
- sub h, phi0, h
-L(wd1): subcc h, plo1, h
- addxc( phi1, %g0, phi1)
- stx h, [qp + 8]
- sub h, phi1, %i0
-
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/cnd_aors_n.asm b/gmp/mpn/sparc64/ultrasparct3/cnd_aors_n.asm
deleted file mode 100644
index f10ee72c1f..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/cnd_aors_n.asm
+++ /dev/null
@@ -1,143 +0,0 @@
-dnl SPARC v9 mpn_cnd_add_n and mpn_cnd_sub_n for T3/T4/T5.
-
-dnl Contributed to the GNU project by David Miller and Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 8.5
-C UltraSPARC T4: 3
-
-C We use a double-pointer trick to allow indexed addressing. Its setup
-C cost might be a problem in these functions, since we don't expect huge n
-C arguments.
-C
-C For sub we need ~(a & mask) = (~a | ~mask) but by complementing mask we can
-C instead do ~(a & ~mask) = (~a | mask), allowing us to use the orn insn.
-
-C INPUT PARAMETERS
-define(`cnd', `%i0')
-define(`rp', `%i1')
-define(`up', `%i2')
-define(`vp', `%i3')
-define(`n', `%i4')
-
-define(`mask', `cnd')
-define(`up0', `%l0') define(`up1', `%l1')
-define(`vp0', `%l2') define(`vp1', `%l3')
-define(`rp0', `%g4') define(`rp1', `%g5')
-define(`u0', `%l4') define(`u1', `%l5')
-define(`v0', `%l6') define(`v1', `%l7')
-define(`x0', `%g1') define(`x1', `%g3')
-define(`w0', `%g1') define(`w1', `%g3')
-
-ifdef(`OPERATION_cnd_add_n',`
- define(`LOGOP', `and $1, $2, $3')
- define(`MAKEMASK',`cmp %g0, $1
- subc %g0, %g0, $2')
- define(`INITCY', `addcc %g0, 0, %g0')
- define(`RETVAL', `addxc( %g0, %g0, %i0)')
- define(`func', `mpn_cnd_add_n')
-')
-ifdef(`OPERATION_cnd_sub_n',`
- define(`LOGOP', `orn $2, $1, $3')
- define(`MAKEMASK',`cmp $1, 1
- subc %g0, %g0, $2')
- define(`INITCY', `subcc %g0, 1, %g0')
- define(`RETVAL', `addxc( %g0, %g0, %i0)
- xor %i0, 1, %i0')
- define(`func', `mpn_cnd_sub_n')
-')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(func)
- save %sp, -176, %sp
-
- MAKEMASK(cnd,mask)
-
- andcc n, 1, %g0
- sllx n, 3, n
- add n, -16, n
- add vp, n, vp0
- add up, n, up0
- add rp, n, rp0
- neg n, n
- be L(evn)
- INITCY
-
-L(odd): ldx [vp0 + n], v1
- ldx [up0 + n], u1
- LOGOP( v1, mask, x1)
- addxccc(u1, x1, w1)
- stx w1, [rp0 + n]
- add n, 8, n
- brgz n, L(rtn)
- nop
-
-L(evn): add vp0, 8, vp1
- add up0, 8, up1
- add rp0, -24, rp1
- ldx [vp0 + n], v0
- ldx [vp1 + n], v1
- ldx [up0 + n], u0
- ldx [up1 + n], u1
- add n, 16, n
- brgz n, L(end)
- add rp0, -16, rp0
-
-L(top): LOGOP( v0, mask, x0)
- ldx [vp0 + n], v0
- LOGOP( v1, mask, x1)
- ldx [vp1 + n], v1
- addxccc(u0, x0, w0)
- ldx [up0 + n], u0
- addxccc(u1, x1, w1)
- ldx [up1 + n], u1
- stx w0, [rp0 + n]
- add n, 16, n
- brlez n, L(top)
- stx w1, [rp1 + n]
-
-L(end): LOGOP( v0, mask, x0)
- LOGOP( v1, mask, x1)
- addxccc(u0, x0, w0)
- addxccc(u1, x1, w1)
- stx w0, [rp0 + n]
- stx w1, [rp1 + 32]
-
-L(rtn): RETVAL
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/dive_1.asm b/gmp/mpn/sparc64/ultrasparct3/dive_1.asm
deleted file mode 100644
index d7dbdf953c..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/dive_1.asm
+++ /dev/null
@@ -1,129 +0,0 @@
-dnl SPARC T3/T4/T5 mpn_divexact_1.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 31
-C UltraSPARC T4/T5: 20-26 hits 20 early, then sharply drops
-
-C INPUT PARAMETERS
-define(`qp', `%i0')
-define(`ap', `%i1')
-define(`n', `%i2')
-define(`d', `%i3')
-
-define(`dinv',`%o4')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_divexact_1)
- save %sp, -176, %sp
- cmp n, 1
- bne,pt %xcc, L(gt1)
- ldx [ap], %o5
- udivx %o5, d, %g1
- stx %g1, [qp]
- return %i7+8
- nop
-
-L(gt1): add d, -1, %g1
- andn %g1, d, %g1
- popc %g1, %i4 C i4 = count_trailing_zeros(d)
-
- srlx d, %i4, d
- srlx d, 1, %g1
- and %g1, 127, %g1
-
- LEA64(binvert_limb_table, g2, g4)
- ldub [%g2+%g1], %g1
- add %g1, %g1, %g2
- mulx %g1, %g1, %g1
- mulx %g1, d, %g1
- sub %g2, %g1, %g2
- add %g2, %g2, %g1
- mulx %g2, %g2, %g2
- mulx %g2, d, %g2
- sub %g1, %g2, %g1
- add %g1, %g1, %o7
- mulx %g1, %g1, %g1
- mulx %g1, d, %g1
- add n, -2, n
- brz,pt %i4, L(norm)
- sub %o7, %g1, dinv
-
-L(unnorm):
- mov 0, %g4
- sub %g0, %i4, %o2
- srlx %o5, %i4, %o5
-L(top_unnorm):
- ldx [ap+8], %g3
- add ap, 8, ap
- sllx %g3, %o2, %g5
- or %g5, %o5, %g5
- srlx %g3, %i4, %o5
- subcc %g5, %g4, %g4
- mulx %g4, dinv, %g1
- stx %g1, [qp]
- add qp, 8, qp
- umulxhi(d, %g1, %g1)
- addxc( %g1, %g0, %g4)
- brgz,pt n, L(top_unnorm)
- add n, -1, n
-
- sub %o5, %g4, %g4
- mulx %g4, dinv, %g1
- stx %g1, [qp]
- return %i7+8
- nop
-
-L(norm):
- mulx dinv, %o5, %g1
- stx %g1, [qp]
- add qp, 8, qp
- addcc %g0, 0, %g4
-L(top_norm):
- umulxhi(d, %g1, %g1)
- ldx [ap+8], %g5
- add ap, 8, ap
- addxc( %g1, %g0, %g1)
- subcc %g5, %g1, %g1
- mulx %g1, dinv, %g1
- stx %g1, [qp]
- add qp, 8, qp
- brgz,pt n, L(top_norm)
- add n, -1, n
-
- return %i7+8
- nop
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/hamdist.asm b/gmp/mpn/sparc64/ultrasparct3/hamdist.asm
deleted file mode 100644
index 20ed8bf15b..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/hamdist.asm
+++ /dev/null
@@ -1,78 +0,0 @@
-dnl SPARC v9 mpn_hamdist for T3/T4.
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 18
-C UltraSPARC T4: 3.5
-
-C INPUT PARAMETERS
-define(`up', `%o0')
-define(`vp', `%o1')
-define(`n', `%o2')
-define(`pcnt', `%o5')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_hamdist) C returns the number of bit positions in which up[0..n-1] and vp[0..n-1] differ
- subcc n, 1, n
- be L(final_one) C n = 1: only the single-limb tail is needed
- clr pcnt C delay slot: running count starts at zero
-L(top): C main loop handles two limbs per iteration
- ldx [up + 0], %g1
- ldx [vp + 0], %g2
- ldx [up + 8], %o4
- ldx [vp + 8], %g3
- sub n, 2, n
- xor %g1, %g2, %g1 C differing bits of the first limb pair
- add up, 16, up
- popc %g1, %g2
- add vp, 16, vp
- xor %o4, %g3, %o4 C differing bits of the second limb pair
- add pcnt, %g2, pcnt
- popc %o4, %g3
- brgz n, L(top)
- add pcnt, %g3, pcnt C delay slot: executed on every iteration
- brlz,pt n, L(done) C n < 0 here means the limb count was even, nothing left
- nop
-L(final_one): C n = 0: exactly one limb remains
- ldx [up + 0], %g1
- ldx [vp + 0], %g2
- xor %g1,%g2, %g1
- popc %g1, %g2
- add pcnt, %g2, pcnt
-L(done):
- retl
- mov pcnt, %o0 C delay slot: return the count
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/invert_limb.asm b/gmp/mpn/sparc64/ultrasparct3/invert_limb.asm
deleted file mode 100644
index 4da49cf030..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/invert_limb.asm
+++ /dev/null
@@ -1,92 +0,0 @@
-dnl SPARC T3/T4/T5 mpn_invert_limb.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: ?
-C UltraSPARC T4/T5: ?
-
-C INPUT PARAMETERS
-define(`d', `%o0')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_invert_limb) C computes a reciprocal approximation of d from a table lookup followed by refinement steps; presumably d must have its top bit set - confirm with callers
- srlx d, 54, %g1
- LEA64(approx_tab, g2, g3)
- and %g1, 0x1fe, %g1 C g1 = byte offset into approx_tab = 2 * (bits 55..62 of d)
- srlx d, 24, %g4
- lduh [%g2+%g1], %g3 C g3 = v0, the 16-bit table estimate
- add %g4, 1, %g4 C g4 = (d >> 24) + 1
- sllx %g3, 11, %g2
- add %g2, -1, %g2
- mulx %g3, %g3, %g3
- mulx %g3, %g4, %g3
- srlx %g3, 40, %g3
- sub %g2, %g3, %g2 C g2 = v1 = (v0 << 11) - 1 - ((v0*v0*g4) >> 40)
- sllx %g2, 60, %g1
- mulx %g2, %g2, %g3
- mulx %g3, %g4, %g4
- sub %g1, %g4, %g1
- srlx %g1, 47, %g1
- sllx %g2, 13, %g2
- add %g1, %g2, %g1 C g1 = v2 = (v1 << 13) + (((v1 << 60) - v1*v1*g4) >> 47)
- and d, 1, %g2 C g2 = low bit of d
- srlx %g1, 1, %g4
- sub %g0, %g2, %g3 C g3 = all-ones mask when d is odd, zero otherwise
- and %g4, %g3, %g3 C g3 = (v2 >> 1) if d is odd, else 0
- srlx d, 1, %g4
- add %g4, %g2, %g2 C g2 = (d >> 1) + (d & 1)
- mulx %g1, %g2, %g2
- sub %g3, %g2, %g2 C g2 = error term for the next step
- umulxhi(%g1, %g2, %g2)
- srlx %g2, 1, %g2
- sllx %g1, 31, %g1
- add %g2, %g1, %g1 C g1 = v3 = (v2 << 31) + (high(v2 * error) >> 1)
- mulx %g1, d, %g3 C low half of v3*d
- umulxhi(d, %g1, %g4) C high half of v3*d
- addcc %g3, d, %g0 C only the carry of low + d is needed
- addxc( %g4, d, %o0) C o0 = high + d + carry
- jmp %o7+8
- sub %g1, %o0, %o0 C delay slot: result = v3 - o0
-EPILOGUE()
-
- RODATA
- ALIGN(2)
- TYPE( approx_tab, object)
- SIZE( approx_tab, 512)
-approx_tab: C 256 halfword entries generated below: 0x7fd00/i for i = 256..511
-forloop(i,256,512-1,dnl
-` .half eval(0x7fd00/i)
-')dnl
diff --git a/gmp/mpn/sparc64/ultrasparct3/missing.asm b/gmp/mpn/sparc64/ultrasparct3/missing.asm
deleted file mode 100644
index c79032dd38..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/missing.asm
+++ /dev/null
@@ -1,77 +0,0 @@
-dnl SPARC v9-2011 simulation support.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(__gmpn_umulh) C high 64 bits of a 64x64-bit unsigned product; operands and result are exchanged through a stack scratch area set up by the caller
- save %sp, -176, %sp
- ldx [%sp+2047+176+256], %o0 C first operand, i.e. the caller's [sp+2047+256]
- ldx [%sp+2047+176+256+8], %o1 C second operand
- rd %ccr, %o4 C save the condition codes; they are restored before returning
- srl %o0, 0, %l4 C l4 = low 32 bits of the first operand
- srl %o1, 0, %l1 C l1 = low 32 bits of the second operand
- srlx %o1, 32, %o1 C o1 = high 32 bits of the second operand
- mulx %o1, %l4, %l2 C cross product high2*low1
- srlx %o0, 32, %o0 C o0 = high 32 bits of the first operand
- mulx %o0, %l1, %l3 C cross product high1*low2
- mulx %l1, %l4, %l1 C low1*low2
- srlx %l1, 32, %l1 C only its upper half contributes to the middle sum
- add %l2, %l1, %l2
- addcc %l2, %l3, %l2 C sum of the middle terms; may carry out of 64 bits
- mulx %o1, %o0, %o1 C high1*high2
- mov 0, %l1
- movcs %xcc, 1, %l1 C l1 = carry out of the middle sum
- sllx %l1, 32, %l1 C that carry has weight 2^32 in the high word
- add %o1, %l1, %o1
- srlx %l2, 32, %o0 C upper half of the middle sum
- add %o1, %o0, %o0
- stx %o0, [%sp+2047+176+256] C result overwrites the first operand slot
- wr %o4, 0, %ccr
- ret
- restore
-EPILOGUE()
-
-PROLOGUE(__gmpn_lzcnt) C count leading zeros of the 64-bit value in the caller's scratch slot; the count is written back to the same slot
- save %sp, -176, %sp
- ldx [%sp+2047+176+256], %o0 C operand, i.e. the caller's [sp+2047+256]
- brz,a %o0, 2f C zero input: the answer is 64
- mov 64, %o1 C annulled delay slot: executed only when the branch is taken
- brlz %o0, 2f C top bit already set: the answer is 0
- mov 0, %o1 C delay slot: also initialises the counter for the loop
-1: sllx %o0, 1, %o0 C shift left until the top bit becomes set
- brgz %o0, 1b
- add %o1, 1, %o1 C delay slot: one more leading zero per shift
-2: stx %o1, [%sp+2047+176+256] C all three paths must store the count, otherwise the caller reads back its own input
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/missing.m4 b/gmp/mpn/sparc64/ultrasparct3/missing.m4
deleted file mode 100644
index e5d6d8e98e..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/missing.m4
+++ /dev/null
@@ -1,88 +0,0 @@
-dnl SPARC v9-2011 simulation support.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-dnl Usage addxccc(r1,r2,r3, t1) -- r3 = r1 + r2 + carry; the expansion only uses the first three arguments
-dnl 64-bit add with carry-in and carry-out, built from movcs on xcc, addcc and addccc
-dnl FIXME: Register g2 must not be destination; g2 is also overwritten before r1 and r2 are read, so it cannot be a source either
-
-define(`addxccc',`dnl
- add %sp, -512, %sp
- stx %g2, [%sp+2047+256+16]
- mov 0, %g2
- movcs %xcc, -1, %g2
- addcc %g2, 1, %g0
- addccc $1, $2, $3
- ldx [%sp+2047+256+16], %g2
- sub %sp, -512, %sp
-')
-
-
-dnl Usage addxc(r1,r2,r3, t1,t2) -- r3 = r1 + r2 + carry; the expansion only uses the first three arguments
-dnl 64-bit add with carry-in; the sum is formed in the branch delay slot and 1 is added when the xcc carry is set; no condition codes are written
-
-define(`addxc',`dnl
- bcc %xcc, 1f
- add $1, $2, $3
- add $3, 1, $3
-1:
-')
-
-
-dnl Usage umulxhi(r1,r2,r3) -- r3 = upper 64 bits of r1 * r2
-dnl 64-bit multiply returning upper 64 bits
-dnl Calls __gmpn_umulh using a non-standard calling convention: operands and result go through a scratch area below the stack pointer, and o7 is saved and restored around the call
-
-define(`umulxhi',`dnl
- add %sp, -512, %sp
- stx $1, [%sp+2047+256]
- stx $2, [%sp+2047+256+8]
- stx %o7, [%sp+2047+256+16]
- call __gmpn_umulh
- nop
- ldx [%sp+2047+256+16], %o7
- ldx [%sp+2047+256], $3
- sub %sp, -512, %sp
-')
-dnl Usage lzcnt(r1,r2) -- r2 = number of leading zero bits in r1
-dnl Plain count leading zeros
-dnl Calls __gmpn_lzcnt using a non-standard calling convention: the operand is stored to a scratch slot in the call delay slot, the result is read back from the same slot, and o7 is saved and restored around the call
-
-define(`lzcnt',`dnl
- add %sp, -512, %sp
- stx %o7, [%sp+2047+256+16]
- call __gmpn_lzcnt
- stx $1, [%sp+2047+256]
- ldx [%sp+2047+256+16], %o7
- ldx [%sp+2047+256], $2
- sub %sp, -512, %sp
-')
diff --git a/gmp/mpn/sparc64/ultrasparct3/mod_1_4.asm b/gmp/mpn/sparc64/ultrasparct3/mod_1_4.asm
deleted file mode 100644
index 08facbd1cc..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/mod_1_4.asm
+++ /dev/null
@@ -1,233 +0,0 @@
-dnl SPARC T3/T4/T5 mpn_mod_1s_4p.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 30
-C UltraSPARC T4/T5: 4
-
-C INPUT PARAMETERS
-define(`ap', `%o0')
-define(`n', `%o1')
-define(`d', `%o2')
-define(`cps', `%o3')
-
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_mod_1s_4p) C remainder of the n-limb number at ap for divisor d, four limbs per iteration, using the constant table at cps (presumably filled in by mpn_mod_1s_4p_cps - confirm)
- save %sp, -176, %sp
- ldx [%i3+16], %o4 C o4, o3, o2, o1, o0 = cps[2] .. cps[6], the per-limb multipliers
- ldx [%i3+24], %o3
- ldx [%i3+32], %o2
- ldx [%i3+40], %o1
- ldx [%i3+48], %o0
-
- and %i1, 3, %g3 C dispatch on n mod 4
- sllx %i1, 3, %g1
- add %i0, %g1, %i0 C i0 = ap + n limbs; limbs are consumed from the top down
- brz %g3, L(b00)
- cmp %g3, 2 C delay slot: compare for the remaining three cases
- bcs %xcc, L(b01)
- nop
- be %xcc, L(b10)
- nop
-
-L(b11): ldx [%i0-16], %g2 C n mod 4 = 3: fold the top three limbs into g2:g1
- mulx %g2, %o4, %g5
- umulxhi(%g2, %o4, %g3)
- ldx [%i0-24], %g4
- addcc %g5, %g4, %g5
- addxc( %g3, %g0, %g4)
- ldx [%i0-8], %g2
- mulx %g2, %o3, %g1
- umulxhi(%g2, %o3, %g3)
- addcc %g1, %g5, %g1
- addxc( %g3, %g4, %g2)
- ba,pt %xcc, .L8
- add %i0, -32, %i0
-
-L(b00): ldx [%i0-24], %g3 C n mod 4 = 0: fold the top four limbs into g2:g1
- mulx %g3, %o4, %g2
- umulxhi(%g3, %o4, %g5)
- ldx [%i0-32], %g4
- addcc %g2, %g4, %g2
- addxc( %g5, %g0, %g3)
- ldx [%i0-16], %g4
- mulx %g4, %o3, %g5
- umulxhi(%g4, %o3, %i5)
- addcc %g2, %g5, %g5
- addxc( %g3, %i5, %g4)
- ldx [%i0-8], %g2
- mulx %g2, %o2, %g1
- umulxhi(%g2, %o2, %g3)
- addcc %g1, %g5, %g1
- addxc( %g3, %g4, %g2)
- ba,pt %xcc, .L8
- add %i0, -40, %i0
-
-L(b01): ldx [%i0-8], %g1 C n mod 4 = 1: start with the top limb alone, high word zero
- mov 0, %g2
- ba,pt %xcc, .L8
- add %i0, -16, %i0
-
-L(b10): ldx [%i0-8], %g2 C n mod 4 = 2: start with the top two limbs as g2:g1
- ldx [%i0-16], %g1
- add %i0, -24, %i0
-
-.L8: add %i1, -5, %g3 C g3 = n - 5; negative means no full group of four is left
- brlz,pn %g3, L(end)
- nop
-
-L(top): ldx [%i0-16], %i4 C each pass folds four more limbs and the old g2:g1 into a new g2:g1
- mulx %i4, %o4, %o5
- umulxhi(%i4, %o4, %i1)
- ldx [%i0-24], %i5
- addcc %o5, %i5, %o5
- addxc( %i1, %g0, %i4)
- ldx [%i0-8], %i5
- mulx %i5, %o3, %o7
- umulxhi(%i5, %o3, %i1)
- addcc %o5, %o7, %o7
- addxc( %i4, %i1, %i5)
- ldx [%i0+0], %g4
- mulx %g4, %o2, %i1
- umulxhi(%g4, %o2, %i4)
- addcc %o7, %i1, %i1
- addxc( %i5, %i4, %g4)
- mulx %g1, %o1, %i5 C previous low word times cps[5]
- umulxhi(%g1, %o1, %i4)
- addcc %i1, %i5, %i5
- addxc( %g4, %i4, %g5)
- mulx %g2, %o0, %g1 C previous high word times cps[6]
- umulxhi(%g2, %o0, %g4)
- addcc %g1, %i5, %g1
- addxc( %g4, %g5, %g2)
- add %g3, -4, %g3
- brgez,pt %g3, L(top)
- add %i0, -32, %i0
-
-L(end): mulx %g2, %o4, %g5
- umulxhi(%g2, %o4, %g3)
- addcc %g1, %g5, %g5
- addxc( %g3, %g0, %g2) C g2:g5 = g1 + g2 * cps[2]
- ldx [%i3+8], %i0 C i0 = cps[1], used below as the shift count
- ldx [%i3], %g4 C g4 = cps[0], used below as the inverse multiplier
- sub %g0, %i0, %i5
- srlx %g5, %i5, %i5
- sllx %g2, %i0, %g2
- or %i5, %g2, %g1 C g1 = high word of (g2:g5) shifted left by the count
- mulx %g1, %g4, %l7
- umulxhi(%g1, %g4, %g3)
- sllx %g5, %i0, %g2 C g2 = low word of the shifted value
- add %g1, 1, %g1
- addcc %l7, %g2, %g5
- addxc( %g3, %g1, %g1) C g1 = quotient estimate
- mulx %g1, %i2, %g1
- sub %g2, %g1, %g2 C g2 = candidate remainder
- cmp %g2, %g5
- add %i2, %g2, %g1
- movlu %xcc, %g2, %g1 C keep candidate + d unless candidate < g5
- subcc %g1, %i2, %g2
- movgeu %xcc, %g2, %g1 C final conditional subtraction of d
- return %i7+8
- srlx %g1, %o0, %o0 C delay slot runs in the caller's window: o0 there is this frame's i0, the shift count
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1s_4p_cps) C fills the seven-word table at i0 from the divisor in i1: inverse, shift count, then five constants each stored shifted right by the count
- save %sp, -176, %sp
- lzcnt( %i1, %i5) C i5 = leading zero count of the divisor
- sllx %i1, %i5, %i1 C i1 = divisor shifted left by that count
- call mpn_invert_limb, 0
- mov %i1, %o0 C delay slot: argument for mpn_invert_limb
- stx %o0, [%i0] C table word 0 = result of mpn_invert_limb
- sra %i5, 0, %g1
- stx %g1, [%i0+8] C table word 1 = shift count
- sub %g0, %i5, %g2
- srlx %o0, %g2, %g2
- mov 1, %g1
- sllx %g1, %i5, %g1
- or %g2, %g1, %g2
- sub %g0, %i1, %g1
- mulx %g2, %g1, %g2 C g2 = -divisor * ((inverse >> (64-count)) | (1 << count))
- srlx %g2, %i5, %g1
- stx %g1, [%i0+16] C table word 2
-
- umulxhi(%o0, %g2, %g3) C the next four groups repeat one pattern: derive the next constant from the previous one
- add %g2, %g3, %g3
- xnor %g0, %g3, %g3
- mulx %g3, %i1, %g3
- mulx %g2, %o0, %g2
- cmp %g2, %g3
- add %i1, %g3, %g1
- movgeu %xcc, %g3, %g1
- srlx %g1, %i5, %g2
- stx %g2, [%i0+24] C table word 3
-
- umulxhi(%o0, %g1, %g3)
- add %g1, %g3, %g3
- xnor %g0, %g3, %g3
- mulx %g3, %i1, %g3
- mulx %g1, %o0, %g1
- cmp %g1, %g3
- add %i1, %g3, %g2
- movgeu %xcc, %g3, %g2
- srlx %g2, %i5, %g1
- stx %g1, [%i0+32] C table word 4
-
- umulxhi(%o0, %g2, %g3)
- add %g2, %g3, %g3
- xnor %g0, %g3, %g3
- mulx %g3, %i1, %g3
- mulx %g2, %o0, %g2
- cmp %g2, %g3
- add %i1, %g3, %g1
- movgeu %xcc, %g3, %g1
- srlx %g1, %i5, %g2
- stx %g2, [%i0+40] C table word 5
-
- umulxhi(%o0, %g1, %g2)
- add %g1, %g2, %g2
- xnor %g0, %g2, %g2
- mulx %g2, %i1, %g2
- mulx %g1, %o0, %o0
- cmp %o0, %g2
- add %i1, %g2, %g3
- movgeu %xcc, %g2, %g3
- srlx %g3, %i5, %i5
- stx %i5, [%i0+48] C table word 6
-
- return %i7+8
- nop
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/mod_34lsub1.asm b/gmp/mpn/sparc64/ultrasparct3/mod_34lsub1.asm
deleted file mode 100644
index 874428069e..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/mod_34lsub1.asm
+++ /dev/null
@@ -1,117 +0,0 @@
-dnl SPARC v9 mpn_mod_34lsub1 for T3/T4/T5.
-
-dnl Copyright 2005, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T1: -
-C UltraSPARC T3: 5
-C UltraSPARC T4: 1.57
-
-C This is based on the powerpc64/mode64 code.
-
-C INPUT PARAMETERS
-define(`up', `%i0')
-define(`n', `%i1')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_mod_34lsub1) C sums the limbs of up[0..n-1] into three accumulators with wrap-around carry and folds them at 48-bit boundaries; the result is presumably congruent to the input modulo 2^48-1 - confirm
- save %sp, -176, %sp
-
- mov 0, %g1 C g1, g3, g4 = accumulators for limbs 0, 1, 2 of each group of three
- mov 0, %g3
- mov 0, %g4
- addcc %g0, 0, %g5 C g5 = 0 and carry flag cleared
-
- add n, -3, n
- brlz n, L(lt3) C fewer than three limbs: only the tail code runs
- nop
-
- add n, -3, n
- ldx [up+0], %l5
- ldx [up+8], %l6
- ldx [up+16], %l7
- brlz n, L(end) C fewer than six limbs: skip the loop
- add up, 24, up
-
- ALIGN(16)
-L(top): addxccc(%g1, %l5, %g1) C the carry out of the third add feeds the first add of the next pass
- ldx [up+0], %l5
- addxccc(%g3, %l6, %g3)
- ldx [up+8], %l6
- addxccc(%g4, %l7, %g4)
- ldx [up+16], %l7
- add n, -3, n
- brgez n, L(top)
- add up, 24, up
-
-L(end): addxccc( %g1, %l5, %g1)
- addxccc(%g3, %l6, %g3)
- addxccc(%g4, %l7, %g4)
- addxc( %g5, %g0, %g5) C capture the final carry in g5
-
-L(lt3): cmp n, -2 C n is now (remaining limbs) - 3
- blt L(2) C no limbs remain
- nop
-
- ldx [up+0], %l5
- mov 0, %l6
- beq L(1) C exactly one limb remains; the branch uses the flags from the cmp above
- addcc %g1, %l5, %g1 C delay slot: add the first remaining limb
-
- ldx [up+8], %l6 C two limbs remain
-L(1): addxccc(%g3, %l6, %g3)
- addxccc(%g4, %g0, %g4)
- addxc( %g5, %g0, %g5)
-
-L(2): sllx %g1, 16, %l0
- srlx %l0, 16, %l0 C %l0 = %g1 mod 2^48
- srlx %g1, 48, %l3 C %l3 = %g1 div 2^48
- srl %g3, 0, %g1
- sllx %g1, 16, %l4 C %l4 = (%g3 mod 2^32) << 16
- srlx %g3, 32, %l5 C %l5 = %g3 div 2^32
- sethi %hi(0xffff0000), %g1
- andn %g4, %g1, %g1 C clears bits 16..31; bits 32 and up are discarded by the shift below
- sllx %g1, 32, %l6 C %l6 = (%g4 mod 2^16) << 32
- srlx %g4, 16, %l7 C %l7 = %g4 div 2^16
-
- add %l0, %l3, %l0
- add %l4, %l5, %l4
- add %l6, %l7, %l6
-
- add %l0, %l4, %l0
- add %l6, %g5, %l6
-
- add %l0, %l6, %i0 C return value = sum of all the pieces
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/mode1o.asm b/gmp/mpn/sparc64/ultrasparct3/mode1o.asm
deleted file mode 100644
index 494e1d3f4f..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/mode1o.asm
+++ /dev/null
@@ -1,82 +0,0 @@
-dnl SPARC T3/T4/T5 mpn_modexact_1c_odd.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 30
-C UltraSPARC T4/T5: 26
-
-C INPUT PARAMETERS
-define(`ap', `%o0')
-define(`n', `%o1')
-define(`d', `%o2')
-define(`cy', `%o3')
-
-define(`dinv',`%o5')
-define(`a0', `%g1')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_modexact_1c_odd) C runs the inverse-multiply reduction over ap[0..n-1] with initial carry cy and returns the final carry limb; presumably d must be odd - confirm with callers
- srlx d, 1, %g1
- and %g1, 127, %g1 C table index = bits 1..7 of d
-
- LEA64(binvert_limb_table, g2, g4)
- ldub [%g2+%g1], %g1 C g1 = starting inverse estimate read from the table
- add %g1, %g1, %g2
- mulx %g1, %g1, %g1
- mulx %g1, d, %g1
- sub %g2, %g1, %g2 C refinement step: inv = 2*inv - inv*inv*d
- add %g2, %g2, %g1
- mulx %g2, %g2, %g2
- mulx %g2, d, %g2
- sub %g1, %g2, %g1 C second refinement step
- add %g1, %g1, %o5
- mulx %g1, %g1, %g1
- mulx %g1, d, %g1
- sub %o5, %g1, dinv C third refinement step yields dinv
- add n, -1, n C loop counter: tested for non-zero, then decremented
-
-L(top): ldx [ap], a0
- add ap, 8, ap
- subcc a0, cy, %g3 C subtract the carry limb; the borrow lands in the carry flag
- mulx %g3, dinv, %g5 C g5 = difference * dinv
- umulxhi(d, %g5, %g5) C high half of g5 * d
- addxc( %g5, %g0, cy) C new carry limb = high half + borrow
- brnz,pt n, L(top)
- add n, -1, n
-
- retl
- mov cy, %o0 C delay slot: return the last carry limb
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/mul_1.asm b/gmp/mpn/sparc64/ultrasparct3/mul_1.asm
deleted file mode 100644
index af05d627bc..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/mul_1.asm
+++ /dev/null
@@ -1,174 +0,0 @@
-dnl SPARC v9 mpn_mul_1 for T3/T4/T5.
-
-dnl Contributed to the GNU project by David Miller and Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 23
-C UltraSPARC T4: 3
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_mul_1) C rp[0..n-1] = low limbs of up[0..n-1] * v0, returning the most significant limb; the loop is unrolled four ways and entered according to n mod 4
- save %sp, -176, %sp
-
- and n, 3, %g5 C dispatch on n mod 4
- add n, -4, n
- brz %g5, L(b0)
- cmp %g5, 2 C delay slot: compare for the remaining three cases
- bcs %xcc, L(b1)
- nop
- be %xcc, L(b2)
- nop
-
-L(b3): addcc %g0, %g0, %i5 C n mod 4 = 3: zero the incoming high limb and clear the carry flag
- ldx [up+0], %l0
- ldx [up+8], %l1
- ldx [up+16], %l2
- mulx %l0, v0, %o0
- umulxhi(%l0, v0, %o1)
- brgz n, L(gt3)
- add rp, -8, rp C delay slot: bias rp so the shared store offsets line up
- mulx %l1, v0, %o2
- umulxhi(%l1, v0, %o3)
- b L(wd3)
- nop
-L(gt3): ldx [up+24], %l3
- mulx %l1, v0, %o2
- umulxhi(%l1, v0, %o3)
- add up, 24, up
- b L(lo3)
- add n, -3, n
-
-L(b2): addcc %g0, %g0, %o1 C n mod 4 = 2: zero the incoming high limb and clear the carry flag
- ldx [up+0], %l1
- ldx [up+8], %l2
- brgz n, L(gt2)
- add rp, -16, rp C delay slot: bias rp so the shared store offsets line up
- mulx %l1, v0, %o2
- umulxhi(%l1, v0, %o3)
- mulx %l2, v0, %o4
- umulxhi(%l2, v0, %o5)
- b L(wd2)
- nop
-L(gt2): ldx [up+16], %l3
- mulx %l1, v0, %o2
- umulxhi(%l1, v0, %o3)
- ldx [up+24], %l0
- mulx %l2, v0, %o4
- umulxhi(%l2, v0, %o5)
- add up, 16, up
- b L(lo2)
- add n, -2, n
-
-L(b1): addcc %g0, %g0, %o3 C n mod 4 = 1: zero the incoming high limb and clear the carry flag
- ldx [up+0], %l2
- brgz n, L(gt1)
- nop
- mulx %l2, v0, %o4 C n = 1: store the low half and return the high half directly
- stx %o4, [rp+0]
- umulxhi(%l2, v0, %i0)
- ret
- restore
-L(gt1): ldx [up+8], %l3
- ldx [up+16], %l0
- mulx %l2, v0, %o4
- umulxhi(%l2, v0, %o5)
- ldx [up+24], %l1
- mulx %l3, v0, %i4
- umulxhi(%l3, v0, %i5)
- add rp, -24, rp
- add up, 8, up
- b L(lo1)
- add n, -1, n
-
-L(b0): addcc %g0, %g0, %o5 C n mod 4 = 0: zero the incoming high limb and clear the carry flag
- ldx [up+0], %l3
- ldx [up+8], %l0
- ldx [up+16], %l1
- mulx %l3, v0, %i4
- umulxhi(%l3, v0, %i5)
- ldx [up+24], %l2
- mulx %l0, v0, %o0
- umulxhi(%l0, v0, %o1)
- b L(lo0)
- nop
-
- ALIGN(16)
-L(top): ldx [up+0], %l3 C 0
- addxccc(%i4, %o5, %i4) C 0
- mulx %l1, v0, %o2 C 1
- stx %i4, [rp+0] C 1
- umulxhi(%l1, v0, %o3) C 2
-L(lo3): ldx [up+8], %l0 C 2
- addxccc(%o0, %i5, %o0) C 3
- mulx %l2, v0, %o4 C 3
- stx %o0, [rp+8] C 4
- umulxhi(%l2, v0, %o5) C 4
-L(lo2): ldx [up+16], %l1 C 5
- addxccc(%o2, %o1, %o2) C 5
- mulx %l3, v0, %i4 C 6
- stx %o2, [rp+16] C 6
- umulxhi(%l3, v0, %i5) C 7
-L(lo1): ldx [up+24], %l2 C 7
- addxccc(%o4, %o3, %o4) C 8
- mulx %l0, v0, %o0 C 8
- stx %o4, [rp+24] C 9
- umulxhi(%l0, v0, %o1) C 9
- add rp, 32, rp C 10
-L(lo0): add up, 32, up C 10
- brgz n, L(top) C 11
- add n, -4, n C 11
-
-L(end): addxccc(%i4, %o5, %i4) C wind-down: finish the limbs already loaded, keeping the carry chain intact
- mulx %l1, v0, %o2
- stx %i4, [rp+0]
- umulxhi(%l1, v0, %o3)
- addxccc(%o0, %i5, %o0)
-L(wd3): mulx %l2, v0, %o4
- stx %o0, [rp+8]
- umulxhi(%l2, v0, %o5)
- addxccc(%o2, %o1, %o2)
-L(wd2): stx %o2, [rp+16]
- addxccc(%o4, %o3, %o4)
- stx %o4, [rp+24]
- addxc( %g0, %o5, %i0) C return value = last high limb + final carry
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/popcount.asm b/gmp/mpn/sparc64/ultrasparct3/popcount.asm
deleted file mode 100644
index de80f3c809..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/popcount.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-dnl SPARC v9 mpn_popcount for T3/T4.
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 15
-C UltraSPARC T4: 2.5
-
-C INPUT PARAMETERS
-define(`up', `%o0')
-define(`n', `%o1')
-define(`pcnt', `%o5')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_popcount) C returns the total number of set bits in up[0..n-1]
- subcc n, 1, n
- be L(final_one) C n = 1: only the single-limb tail is needed
- clr pcnt C delay slot: running count starts at zero
-L(top): C main loop handles two limbs per iteration
- ldx [up + 0], %g1
- sub n, 2, n
- ldx [up + 8], %o4
- add up, 16, up
- popc %g1, %g2
- popc %o4, %g3
- add pcnt, %g2, pcnt
- brgz n, L(top)
- add pcnt, %g3, pcnt C delay slot: executed on every iteration
- brlz,pt n, L(done) C n < 0 here means the limb count was even, nothing left
- nop
-L(final_one): C n = 0: exactly one limb remains
- ldx [up + 0], %g1
- popc %g1, %g2
- add pcnt, %g2, pcnt
-L(done):
- retl
- mov pcnt, %o0 C delay slot: return the count
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm b/gmp/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm
deleted file mode 100644
index 216ddc0ba1..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,93 +0,0 @@
-dnl SPARC v9 mpn_sqr_diag_addlsh1 for T3/T4/T5.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: ?
-C UltraSPARC T4: >= 4.5
-
-
-define(`rp', `%i0')
-define(`tp', `%i1')
-define(`up', `%i2')
-define(`n', `%i3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_sqr_diag_addlsh1) C writes to rp the squares of the limbs of up plus twice the limbs of tp, two result limbs per input limb of up; presumably n >= 2 - confirm with callers
- save %sp, -176, %sp
-
- ldx [up+0], %g1
- mulx %g1, %g1, %o0
- umulxhi(%g1, %g1, %g2) C g2 = high half of up[0]^2, carried into the next pair
- stx %o0, [rp+0] C lowest result limb is just the low half of up[0]^2
-
- ldx [up+8], %g1
- ldx [tp+0], %g4
- ldx [tp+8], %g5
- mulx %g1, %g1, %o0
- orcc %g0, %g0, %o5 C o5 = 0 and carry flag cleared
- b L(dm)
- add n, -2, n
-
- ALIGN(16)
-L(top): ldx [up+8], %g1
- addcc %g4, %o2, %o2 C add the doubled tp limbs to the pending square halves
- addxccc(%g5, %o0, %g3)
- ldx [tp+16], %g4
- ldx [tp+24], %g5
- mulx %g1, %g1, %o0
- stx %o2, [rp+8]
- stx %g3, [rp+16]
- add rp, 16, rp
- add tp, 16, tp
-L(dm): add %g2, %o5, %o2 C o2 = previous high half plus the saved carry out of the doubling
- umulxhi(%g1, %g1, %g2)
- addxccc(%g4, %g4, %g4) C double two tp limbs, chaining the carry
- addxccc(%g5, %g5, %g5)
- add up, 8, up
- addxc( %g0, %g0, %o5) C save the carry out of the doubling in o5
- brnz n, L(top)
- add n, -1, n
-
- addcc %o2, %g4, %g4 C final pair after the loop
- addxccc(%o0, %g5, %g5)
- stx %g4, [rp+8]
- stx %g5, [rp+16]
- addxc( %o5, %g2, %g2) C top limb = last high half + saved carry + final carry
- stx %g2, [rp+24]
-
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/sub_n.asm b/gmp/mpn/sparc64/ultrasparct3/sub_n.asm
deleted file mode 100644
index 0e4bc939e3..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/sub_n.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl SPARC v9 mpn_sub_n for T3/T4.
-
-dnl Contributed to the GNU project by David Miller.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 8
-C UltraSPARC T4: 3
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`vp', `%i2')
-define(`n', `%i3')
-define(`cy', `%i4')
-
-define(`u0_off', `%l0')
-define(`u1_off', `%l1')
-define(`v0_off', `%l2')
-define(`v1_off', `%l3')
-define(`r0_off', `%l4')
-define(`r1_off', `%l5')
-define(`loop_n', `%l6')
-define(`tmp', `%l7')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_sub_nc) C same as mpn_sub_n below but with a borrow-in in cy; joins the shared body at L(ent)
- save %sp, -176, %sp
- ba,pt %xcc, L(ent)
- xor cy, 1, cy C delay slot: invert the borrow-in so it can serve as the carry-in of the add below
-EPILOGUE()
-PROLOGUE(mpn_sub_n) C rp[0..n-1] = up[0..n-1] - vp[0..n-1], computed as up + ~vp + carry; returns the borrow out (0 or 1)
- save %sp, -176, %sp
- mov 1, cy C no borrow-in corresponds to a carry-in of 1
-L(ent):
- subcc n, 1, n
- be L(final_one) C n = 1: only the single-limb tail is needed
- cmp %g0, cy C delay slot: carry flag = (cy != 0)
-
- ldx [up + 0], %o4
- sllx n, 3, tmp C tmp = byte offset of the last limb
-
- ldx [vp + 0], %o5
- add up, tmp, u0_off
-
- ldx [up + 8], %g5
- add vp, tmp, v0_off
-
- ldx [vp + 8], %g1
- add rp, tmp, r0_off
-
- neg tmp, loop_n C loop_n is a negative byte index that counts up towards zero
- add u0_off, 8, u1_off
-
- add v0_off, 8, v1_off
- sub loop_n, -(2 * 8), loop_n
-
- sub r0_off, 16, r0_off
- brgez,pn loop_n, L(loop_tail) C only two or three limbs: skip the loop
- sub r0_off, 8, r1_off
-
- b,a L(top)
- ALIGN(16)
-L(top): C two limbs per iteration; loads for the next pair overlap the current adds
- xnor %o5, 0, tmp C tmp = ~v limb
- ldx [loop_n + v0_off], %o5
-
- addxccc(%o4, tmp, %g3)
- ldx [loop_n + u0_off], %o4
-
- xnor %g1, 0, %g1
- stx %g3, [loop_n + r0_off]
-
- addxccc(%g5, %g1, tmp)
- ldx [loop_n + v1_off], %g1
-
- ldx [loop_n + u1_off], %g5
- sub loop_n, -(2 * 8), loop_n
-
- brlz loop_n, L(top)
- stx tmp, [loop_n + r1_off] C delay slot: r1_off is biased so this uses the already advanced loop_n
-
-L(loop_tail): C finish the pair that is already loaded
- xnor %o5, 0, tmp
- xnor %g1, 0, %g1
-
- addxccc(%o4, tmp, %g3)
- add loop_n, u0_off, up
-
- addxccc(%g5, %g1, %g5)
- add loop_n, r0_off, rp
-
- stx %g3, [rp + 0]
- add loop_n, v0_off, vp
-
- brgz,pt loop_n, L(done) C loop_n > 0 means no odd limb is left
- stx %g5, [rp + 8]
-
- add rp, (2 * 8), rp
-
-L(final_one): C one remaining limb
- ldx [up+0], %o4
- ldx [vp+0], %o5
- xnor %o5, %g0, %o5
- addxccc(%o4, %o5, %g3)
- stx %g3, [rp+0]
-
-L(done):
- clr %i0
- movcc %xcc, 1, %i0 C carry clear after up + ~vp + carry means a borrow occurred
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/sparc64/ultrasparct3/submul_1.asm b/gmp/mpn/sparc64/ultrasparct3/submul_1.asm
deleted file mode 100644
index 5635d1bdbd..0000000000
--- a/gmp/mpn/sparc64/ultrasparct3/submul_1.asm
+++ /dev/null
@@ -1,170 +0,0 @@
-dnl SPARC v9 mpn_submul_1 for T3/T4/T5.
-
-dnl Contributed to the GNU project by David Miller and Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C UltraSPARC T3: 26
-C UltraSPARC T4: 4.5
-
-C INPUT PARAMETERS
-define(`rp', `%i0')
-define(`up', `%i1')
-define(`n', `%i2')
-define(`v0', `%i3')
-
-ASM_START()
- REGISTER(%g2,#scratch)
- REGISTER(%g3,#scratch)
-PROLOGUE(mpn_submul_1)
- save %sp, -176, %sp
- ldx [up+0], %g1
-
- and n, 3, %g5
- add n, -4, n
- brz %g5, L(b00)
- cmp %g5, 2
- bcs %xcc, L(b01)
- nop
- bne %xcc, L(b11)
- ldx [up+8], %g4
-
-L(b10): add up, 16, up
- addcc %g0, 0, %g3
- mulx %g1, v0, %l4
- umulxhi(%g1, v0, %l5)
- ldx [rp+0], %o2
- mulx %g4, v0, %l6
- umulxhi(%g4, v0, %l7)
- brlz n, L(wd2)
- nop
-L(gt2): ldx [up+0], %o0
- b L(lo2)
- nop
-
-L(b00): add rp, -16, rp
- addcc %g0, 0, %g3
- ldx [up+8], %o1
- mulx %g1, v0, %l0
- umulxhi(%g1, v0, %l1)
- ldx [up+16], %o0
- ldx [rp+16], %o2
- mulx %o1, v0, %l2
- umulxhi(%o1, v0, %l3)
- b L(lo0)
- nop
-
-L(b01): add up, 8, up
- add rp, -8, rp
- addcc %g0, 0, %g3
- ldx [rp+8], %o3
- mulx %g1, v0, %l6
- umulxhi(%g1, v0, %l7)
- brlz n, L(wd1)
- nop
- ldx [up+0], %o0
- ldx [up+8], %o1
- mulx %o0, v0, %l0
- umulxhi(%o0, v0, %l1)
- b L(lo1)
- nop
-
-L(b11): add up, 24, up
- add rp, 8, rp
- addcc %g0, 0, %g3
- mulx %g1, v0, %l2
- umulxhi(%g1, v0, %l3)
- ldx [up-8], %o1
- ldx [rp-8], %o3
- mulx %g4, v0, %l4
- umulxhi(%g4, v0, %l5)
- brlz n, L(end)
- nop
-
- ALIGN(16)
-L(top): ldx [up+0], %o0
- addxccc(%g3, %l2, %g1)
- ldx [rp+0], %o2
- addxc( %g0, %l3, %g3)
- mulx %o1, v0, %l6
- subcc %o3, %g1, %g4
- umulxhi(%o1, v0, %l7)
- stx %g4, [rp-8]
-L(lo2): ldx [up+8], %o1
- addxccc(%g3, %l4, %g1)
- ldx [rp+8], %o3
- addxc( %g0, %l5, %g3)
- mulx %o0, v0, %l0
- subcc %o2, %g1, %g4
- umulxhi(%o0, v0, %l1)
- stx %g4, [rp+0]
-L(lo1): ldx [up+16], %o0
- addxccc(%g3, %l6, %g1)
- ldx [rp+16], %o2
- addxc( %g0, %l7, %g3)
- mulx %o1, v0, %l2
- subcc %o3, %g1, %g4
- umulxhi(%o1, v0, %l3)
- stx %g4, [rp+8]
-L(lo0): ldx [up+24], %o1
- addxccc(%g3, %l0, %g1)
- ldx [rp+24], %o3
- addxc( %g0, %l1, %g3)
- mulx %o0, v0, %l4
- subcc %o2, %g1, %g4
- umulxhi(%o0, v0, %l5)
- stx %g4, [rp+16]
- add n, -4, n
- add up, 32, up
- brgez n, L(top)
- add rp, 32, rp
-
-L(end): addxccc(%g3, %l2, %g1)
- ldx [rp+0], %o2
- addxc( %g0, %l3, %g3)
- mulx %o1, v0, %l6
- subcc %o3, %g1, %g4
- umulxhi(%o1, v0, %l7)
- stx %g4, [rp-8]
-L(wd2): addxccc(%g3, %l4, %g1)
- ldx [rp+8], %o3
- addxc( %g0, %l5, %g3)
- subcc %o2, %g1, %g4
- stx %g4, [rp+0]
-L(wd1): addxccc(%g3, %l6, %g1)
- addxc( %g0, %l7, %g3)
- subcc %o3, %g1, %g4
- stx %g4, [rp+8]
- addxc( %g0, %g3, %i0)
- ret
- restore
-EPILOGUE()
diff --git a/gmp/mpn/thumb/add_n.asm b/gmp/mpn/thumb/add_n.asm
deleted file mode 100644
index 08ed60b9be..0000000000
--- a/gmp/mpn/thumb/add_n.asm
+++ /dev/null
@@ -1,63 +0,0 @@
-dnl ARM/Thumb mpn_add_n.
-
-dnl Copyright 1997, 2000, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-define(`rp', r0)
-define(`up', r1)
-define(`vp', r2)
-define(`n', r3)
-
-ASM_START()
- .thumb
-PROLOGUE(mpn_add_nc)
- push {r4, r5, r6}
- ldr r6, [sp, #12] C init carry save register
- sub r6, #1
- b L(top)
-EPILOGUE()
-PROLOGUE(mpn_add_n)
- push {r4, r5, r6}
- neg r6, n C init carry save register
-
-L(top): ldmia up!, {r4} C load next limb from S1
- cmp n, r6 C tricky carry restore
- ldmia vp!, {r5} C load next limb from S2
- adc r4, r5
- stmia rp!, {r4} C store result limb to RES
- sbc r6, r6 C save negated carry
- sub n, #1
- bne L(top)
-
- add r0, r6, #1
- pop {r4, r5, r6}
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/thumb/add_n.s b/gmp/mpn/thumb/add_n.s
new file mode 100644
index 0000000000..294cfe6ae2
--- /dev/null
+++ b/gmp/mpn/thumb/add_n.s
@@ -0,0 +1,48 @@
+@ ARM/Thumb mpn_add_n -- Add two limb vectors of the same length > 0 and store
+@ sum in a third limb vector.
+
+@ Copyright 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 3 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_add_n
+___gmpn_add_n: @ r0=RES_ptr r1=S1_ptr r2=S2_ptr r3=SIZE (> 0); returns carry-out (0 or 1) in r0
+ push {r4, r5, r6, lr}
+ neg r6, r3 @ init carry save register: r6 = -SIZE makes the first cmp borrow, so C starts clear
+
+Loop: ldmia r1!, {r4} @ load next limb from S1
+ cmp r3, r6 @ restore carry: r6 == 0 gives C set, r6 == -1 (or -SIZE) gives C clear
+ ldmia r2!, {r5} @ load next limb from S2
+ adc r4, r5 @ r4 = S1 limb + S2 limb + carry
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save carry as r6 = C - 1 (0 if carry set, -1 if clear)
+ sub r3, #1 @ one limb done; sets Z when no limbs remain
+ bne Loop @ loop back while limbs remain (exactly SIZE iterations)
+
+ add r0, r6, #1 @ return value: 1 if the last adc carried out, else 0
+ pop {r4, r5, r6, pc}
diff --git a/gmp/mpn/thumb/sub_n.asm b/gmp/mpn/thumb/sub_n.asm
deleted file mode 100644
index a38572048e..0000000000
--- a/gmp/mpn/thumb/sub_n.asm
+++ /dev/null
@@ -1,63 +0,0 @@
-dnl ARM/Thumb mpn_sub_n.
-
-dnl Copyright 1997, 2000, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C INPUT PARAMETERS
-define(`rp', r0)
-define(`up', r1)
-define(`vp', r2)
-define(`n', r3)
-
-ASM_START()
- .thumb
-PROLOGUE(mpn_sub_nc)
- push {r4, r5, r6}
- ldr r6, [sp, #12] C init carry save register
- neg r6, r6
- b L(top)
-EPILOGUE()
-PROLOGUE(mpn_sub_n)
- push {r4, r5, r6}
- mov r6, n C init carry save register
-
-L(top): ldmia up!, {r4} C load next limb from S1
- cmp n, r6 C tricky carry restore
- ldmia vp!, {r5} C load next limb from S2
- sbc r4, r5
- stmia rp!, {r4} C store result limb to RES
- sbc r6, r6 C save negated carry
- sub n, #1
- bne L(top)
-
- neg r0, r6
- pop {r4, r5, r6}
- bx lr
-EPILOGUE()
diff --git a/gmp/mpn/thumb/sub_n.s b/gmp/mpn/thumb/sub_n.s
new file mode 100644
index 0000000000..fbd4c98194
--- /dev/null
+++ b/gmp/mpn/thumb/sub_n.s
@@ -0,0 +1,48 @@
+@ ARM/Thumb mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+@ store difference in a third limb vector.
+
+@ Copyright 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 3 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_sub_n
+___gmpn_sub_n: @ r0=RES_ptr r1=S1_ptr r2=S2_ptr r3=SIZE (> 0); returns borrow-out (0 or 1) in r0
+ push {r4, r5, r6, lr}
+ mov r6, r3 @ init carry save register: r6 = SIZE makes the first cmp leave C set (no borrow-in)
+
+Loop: ldmia r1!, {r4} @ load next limb from S1
+ cmp r3, r6 @ restore carry: r6 == 0 (or SIZE) gives C set = no borrow, r6 == -1 gives C clear = borrow
+ ldmia r2!, {r5} @ load next limb from S2
+ sbc r4, r5 @ r4 = S1 limb - S2 limb - borrow
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save carry as r6 = C - 1 (0 if no borrow, -1 if borrow)
+ sub r3, #1 @ one limb done; sets Z when no limbs remain
+ bne Loop @ loop back while limbs remain (exactly SIZE iterations)
+
+ neg r0, r6 @ return value: 1 if the last sbc borrowed, else 0
+ pop {r4, r5, r6, pc}
diff --git a/gmp/mpn/vax/add_n.asm b/gmp/mpn/vax/add_n.asm
deleted file mode 100644
index 0a0bf78ab3..0000000000
--- a/gmp/mpn/vax/add_n.asm
+++ /dev/null
@@ -1,64 +0,0 @@
-dnl VAX mpn_add_n -- Add two limb vectors of the same length > 0 and store sum
-dnl in a third limb vector.
-
-dnl Copyright 1999, 2000, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(mpn_add_n)
- .word 0x0
- movl 16(ap), r0
- movl 12(ap), r1
- movl 8(ap), r2
- movl 4(ap), r3
- mnegl r0, r5
- addl2 $3, r0
- ashl $-2, r0, r0 C unroll loop count
- bicl2 $-4, r5 C mask out low 2 bits
- movaq (r5)[r5], r5 C 9x
- jmp L(top)[r5]
-
-L(top): movl (r2)+, r4
- adwc (r1)+, r4
- movl r4, (r3)+
- movl (r2)+, r4
- adwc (r1)+, r4
- movl r4, (r3)+
- movl (r2)+, r4
- adwc (r1)+, r4
- movl r4, (r3)+
- movl (r2)+, r4
- adwc (r1)+, r4
- movl r4, (r3)+
- sobgtr r0, L(top)
-
- adwc r0, r0
- ret
-EPILOGUE()
diff --git a/gmp/mpn/vax/add_n.s b/gmp/mpn/vax/add_n.s
new file mode 100644
index 0000000000..60773cc348
--- /dev/null
+++ b/gmp/mpn/vax/add_n.s
@@ -0,0 +1,59 @@
+# VAX __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+# INPUT PARAMETERS (read from the argument list through ap)
+# res_ptr 4(ap)
+# s1_ptr 8(ap)
+# s2_ptr 12(ap)
+# size 16(ap)
+
+.text
+ .align 1
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ .word 0x0 # entry mask: no registers saved (only r0-r5 are used below)
+ movl 16(ap),r0 # r0 = size
+ movl 12(ap),r1 # r1 = s2_ptr
+ movl 8(ap),r2 # r2 = s1_ptr
+ movl 4(ap),r3 # r3 = res_ptr
+ mnegl r0,r5 # r5 = -size
+ addl2 $3,r0
+ ashl $-2,r0,r0 # r0 = (size + 3) / 4 = number of passes through the 4-way unrolled loop
+ bicl2 $-4,r5 # keep only the low 2 bits: r5 = (-size) mod 4 = limb groups to skip on the first pass
+ movaq (r5)[r5],r5 # r5 = 9 * r5 (assumes each 3-instruction limb group below encodes in 9 bytes)
+ jmp Loop(r5) # enter the unrolled loop part-way so that exactly size limbs are processed
+
+Loop: movl (r2)+,r4 # r4 = next s1 limb
+ adwc (r1)+,r4 # r4 += next s2 limb + carry
+ movl r4,(r3)+ # store result limb
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop # r0 -= 1; loop while r0 > 0
+
+ adwc r0,r0 # r0 is 0 here, so r0 = carry from the last adwc: the return value
+ ret
diff --git a/gmp/mpn/vax/addmul_1.asm b/gmp/mpn/vax/addmul_1.asm
deleted file mode 100644
index 8a6f636bdf..0000000000
--- a/gmp/mpn/vax/addmul_1.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-dnl VAX mpn_addmul_1 -- Multiply a limb vector with a limb and add the result
-dnl to a second limb vector.
-
-dnl Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
- .word 0xfc0
- movl 12(ap), r4
- movl 8(ap), r8
- movl 4(ap), r9
- clrl r3
- incl r4
- ashl $-1, r4, r7
- clrl r11
- movl 16(ap), r6
- jlss L(v0_big)
- jlbc r4, L(1)
-
-C Loop for v0 < 0x80000000
-L(tp1): movl (r8)+, r1
- jlss L(1n0)
- emul r1, r6, $0, r2
- addl2 r11, r2
- adwc $0, r3
- addl2 r2, (r9)+
- adwc $0, r3
-L(1): movl (r8)+, r1
- jlss L(1n1)
-L(1p1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc $0, r11
- addl2 r10, (r9)+
- adwc $0, r11
-
- sobgtr r7, L(tp1)
- movl r11, r0
- ret
-
-L(1n0): emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r6, r3
- addl2 r2, (r9)+
- adwc $0, r3
- movl (r8)+, r1
- jgeq L(1p1)
-L(1n1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r6, r11
- addl2 r10, (r9)+
- adwc $0, r11
-
- sobgtr r7, L(tp1)
- movl r11, r0
- ret
-
-L(v0_big):
- jlbc r4, L(2)
-
-C Loop for v0 >= 0x80000000
-L(tp2): movl (r8)+, r1
- jlss L(2n0)
- emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r1, r3
- addl2 r2, (r9)+
- adwc $0, r3
-L(2): movl (r8)+, r1
- jlss L(2n1)
-L(2p1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r1, r11
- addl2 r10, (r9)+
- adwc $0, r11
-
- sobgtr r7, L(tp2)
- movl r11, r0
- ret
-
-L(2n0): emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r6, r3
- addl2 r2, (r9)+
- adwc r1, r3
- movl (r8)+, r1
- jgeq L(2p1)
-L(2n1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r6, r11
- addl2 r10, (r9)+
- adwc r1, r11
-
- sobgtr r7, L(tp2)
- movl r11, r0
- ret
-EPILOGUE()
diff --git a/gmp/mpn/vax/addmul_1.s b/gmp/mpn/vax/addmul_1.s
new file mode 100644
index 0000000000..e2f86e074d
--- /dev/null
+++ b/gmp/mpn/vax/addmul_1.s
@@ -0,0 +1,124 @@
+# VAX __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+# INPUT PARAMETERS (read from the argument list through ap)
+# res_ptr 4(ap)
+# s1_ptr 8(ap)
+# size 12(ap)
+# s2_limb 16(ap)
+
+.text
+ .align 1
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ .word 0xfc0 # entry mask: save r6-r11
+ movl 12(ap),r4 # r4 = size
+ movl 8(ap),r8 # r8 = s1_ptr
+ movl 4(ap),r9 # r9 = res_ptr
+ movl 16(ap),r6 # r6 = s2_limb; this movl sets N for the jlss below
+ jlss s2_big # s2_limb has its top bit set: use the second loop
+
+ clrl r3 # zero carry limb for an entry at L1
+ incl r4
+ ashl $-1,r4,r7 # r7 = (size + 1) / 2 = loop passes, two limbs per pass
+ jlbc r4,L1 # size odd: enter mid-loop and do a single limb first
+ clrl r11 # zero carry limb for an entry at Loop1
+
+# Loop for S2_LIMB < 0x80000000
+# emul is a signed multiply, so whenever an operand has its top bit set the
+# other operand is added into the high word to obtain the unsigned product.
+# r3 and r11 alternate as the carry limb between the two halves of the loop.
+Loop1: movl (r8)+,r1 # r1 = next s1 limb
+ jlss L1n0 # limb has top bit set: take the correcting path
+ emul r1,r6,$0,r2 # r3:r2 = r1 * r6
+ addl2 r11,r2 # add carry limb from the previous step
+ adwc $0,r3
+ addl2 r2,(r9)+ # accumulate low word into the result limb
+ adwc $0,r3 # r3 = carry limb for the next step
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10 # r11:r10 = r1 * r6
+ addl2 r3,r10
+ adwc $0,r11
+ addl2 r10,(r9)+
+ adwc $0,r11 # r11 = carry limb for the next step
+
+ sobgtr r7,Loop1
+ movl r11,r0 # return the final carry limb
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3 # s1 limb was negative as signed: add s2_limb into the high word
+ addl2 r2,(r9)+
+ adwc $0,r3
+ movl (r8)+,r1
+ jgeq L1p1 # next limb non-negative: rejoin the main path
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11 # s1 limb was negative as signed: add s2_limb into the high word
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0 # return the final carry limb
+ ret
+
+
+s2_big: clrl r3 # same setup as above, for the s2_limb >= 0x80000000 loop
+ incl r4
+ ashl $-1,r4,r7 # r7 = (size + 1) / 2 loop passes
+ jlbc r4,L2 # size odd: enter mid-loop
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+# s2_limb is negative as signed here, so the s1 limb is always added into the
+# high word; s2_limb is added as well when the s1 limb is negative too.
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3 # correction for negative s2_limb
+ addl2 r2,(r9)+
+ adwc $0,r3
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11 # correction for negative s2_limb
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0 # return the final carry limb
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3 # correction for negative s1 limb
+ addl2 r2,(r9)+
+ adwc r1,r3 # correction for negative s2_limb, plus carry from the memory add
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11 # correction for negative s1 limb
+ addl2 r10,(r9)+
+ adwc r1,r11 # correction for negative s2_limb, plus carry from the memory add
+
+ sobgtr r7,Loop2
+ movl r11,r0 # return the final carry limb
+ ret
diff --git a/gmp/mpn/vax/elf.m4 b/gmp/mpn/vax/elf.m4
deleted file mode 100644
index e04f0bafc9..0000000000
--- a/gmp/mpn/vax/elf.m4
+++ /dev/null
@@ -1,54 +0,0 @@
-divert(-1)
-
-dnl m4 macros for VAX assembler.
-
-dnl Copyright 2001, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-defreg(r0,`%r``''0')
-defreg(r1,`%r``''1')
-defreg(r2,`%r``''2')
-defreg(r3,`%r``''3')
-defreg(r4,`%r``''4')
-defreg(r5,`%r``''5')
-defreg(r6,`%r``''6')
-defreg(r7,`%r``''7')
-defreg(r8,`%r``''8')
-defreg(r9,`%r``''9')
-defreg(r10,`%r``''10')
-defreg(r11,`%r``''11')
-defreg(r12,`%r``''12')
-defreg(r13,`%r``''13')
-defreg(r14,`%r``''14')
-defreg(r15,`%r``''15')
-defreg(ap,`%a``''p')
-
-define(`foo', blablabla)
-
-divert
diff --git a/gmp/mpn/vax/gmp-mparam.h b/gmp/mpn/vax/gmp-mparam.h
index 9f20b9b783..ea262ddc40 100644
--- a/gmp/mpn/vax/gmp-mparam.h
+++ b/gmp/mpn/vax/gmp-mparam.h
@@ -1,41 +1,30 @@
/* VAX gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2000-2002 Free Software Foundation, Inc.
+Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* These numbers were measured manually using the tune/speed program.
- The standard tune/tuneup takes too long. (VAX 8800) */
+ The standard tune/tuneup takes too long. (VAX 8800) */
-#define MUL_TOOM22_THRESHOLD 14
-#define MUL_TOOM33_THRESHOLD 110
+#define MUL_KARATSUBA_THRESHOLD 14
+#define MUL_TOOM3_THRESHOLD 110
#define SQR_BASECASE_THRESHOLD 6
-#define SQR_TOOM2_THRESHOLD 42
+#define SQR_KARATSUBA_THRESHOLD 42
#define SQR_TOOM3_THRESHOLD 250
/* #define DIV_SB_PREINV_THRESHOLD */
diff --git a/gmp/mpn/vax/lshift.asm b/gmp/mpn/vax/lshift.asm
deleted file mode 100644
index 941e9994b8..0000000000
--- a/gmp/mpn/vax/lshift.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-dnl VAX mpn_lshift -- left shift.
-
-dnl Copyright 1999-2001, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(mpn_lshift)
- .word 0x1c0
- movl 4(ap), r7
- movl 8(ap), r6
- movl 12(ap), r1
- movl 16(ap), r8
-
- moval (r6)[r1], r6
- moval (r7)[r1], r7
- clrl r3
- movl -(r6), r2
- ashq r8, r2, r4
- movl r5, r0
- movl r2, r3
- decl r1
- jeql L(end)
-
-L(top): movl -(r6), r2
- ashq r8, r2, r4
- movl r5, -(r7)
- movl r2, r3
- sobgtr r1, L(top)
-
-L(end): movl r4, -4(r7)
- ret
-EPILOGUE()
diff --git a/gmp/mpn/vax/lshift.s b/gmp/mpn/vax/lshift.s
new file mode 100644
index 0000000000..6f3d600be0
--- /dev/null
+++ b/gmp/mpn/vax/lshift.s
@@ -0,0 +1,56 @@
+# VAX mpn_lshift -- left shift.
+
+# Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+# INPUT PARAMETERS (read from the argument list through ap)
+# rptr 4(ap)
+# sptr 8(ap)
+# size 12(ap)
+# cnt 16(ap)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_lshift
+___gmpn_lshift:
+ .word 0x1c0 # entry mask: save r6-r8
+ movl 4(ap),r7 # r7 = rptr
+ movl 8(ap),r6 # r6 = sptr
+ movl 12(ap),r1 # r1 = size
+ movl 16(ap),r8 # r8 = cnt
+
+ moval (r6)[r1],r6 # r6 = sptr + 4*size: limbs are walked from the top down
+ moval (r7)[r1],r7 # r7 = rptr + 4*size
+ clrl r3 # high half of the first shift input is zero
+ movl -(r6),r2 # r2 = most significant source limb
+ ashq r8,r2,r4 # r5:r4 = r3:r2 << cnt
+ movl r5,r0 # return value: the bits shifted out of the top limb
+ movl r2,r3 # current limb becomes the high half for the next shift
+ decl r1
+ jeql Lend # size == 1: only the final store remains
+
+Loop: movl -(r6),r2 # r2 = next lower source limb
+ ashq r8,r2,r4 # r5:r4 = r3:r2 << cnt
+ movl r5,-(r7) # r5 = upper limb shifted, merged with the top bits of the lower limb
+ movl r2,r3
+ sobgtr r1,Loop # r1 -= 1; loop while r1 > 0
+
+Lend: movl r4,-4(r7) # least significant result limb: lowest source limb << cnt
+ ret
diff --git a/gmp/mpn/vax/mul_1.asm b/gmp/mpn/vax/mul_1.asm
deleted file mode 100644
index 8e4dcd2177..0000000000
--- a/gmp/mpn/vax/mul_1.asm
+++ /dev/null
@@ -1,118 +0,0 @@
-dnl VAX mpn_mul_1 -- Multiply a limb vector with a limb and store the result
-dnl in a second limb vector.
-
-dnl Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(mpn_mul_1)
- .word 0xfc0
- movl 12(ap), r4
- movl 8(ap), r8
- movl 4(ap), r9
- clrl r3
- incl r4
- ashl $-1, r4, r7
- clrl r11
- movl 16(ap), r6
- jlss L(v0_big)
- jlbc r4, L(1)
-
-C Loop for v0 < 0x80000000
-L(tp1): movl (r8)+, r1
- jlss L(1n0)
- emul r1, r6, $0, r2
- addl2 r11, r2
- adwc $0, r3
- movl r2, (r9)+
-L(1): movl (r8)+, r1
- jlss L(1n1)
-L(1p1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc $0, r11
- movl r10, (r9)+
-
- sobgtr r7, L(tp1)
- movl r11, r0
- ret
-
-L(1n0): emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r6, r3
- movl r2, (r9)+
- movl (r8)+, r1
- jgeq L(1p1)
-L(1n1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r6, r11
- movl r10, (r9)+
-
- sobgtr r7, L(tp1)
- movl r11, r0
- ret
-
-L(v0_big):
- jlbc r4, L(2)
-
-C Loop for v0 >= 0x80000000
-L(tp2): movl (r8)+, r1
- jlss L(2n0)
- emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r1, r3
- movl r2, (r9)+
-L(2): movl (r8)+, r1
- jlss L(2n1)
-L(2p1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r1, r11
- movl r10, (r9)+
-
- sobgtr r7, L(tp2)
- movl r11, r0
- ret
-
-L(2n0): emul r1, r6, $0, r2
- addl2 r1, r3
- addl2 r11, r2
- adwc r6, r3
- movl r2, (r9)+
- movl (r8)+, r1
- jgeq L(2p1)
-L(2n1): emul r1, r6, $0, r10
- addl2 r1, r11
- addl2 r3, r10
- adwc r6, r11
- movl r10, (r9)+
-
- sobgtr r7, L(tp2)
- movl r11, r0
- ret
-EPILOGUE()
diff --git a/gmp/mpn/vax/mul_1.s b/gmp/mpn/vax/mul_1.s
new file mode 100644
index 0000000000..c6f4594bd8
--- /dev/null
+++ b/gmp/mpn/vax/mul_1.s
@@ -0,0 +1,121 @@
+# VAX __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+# INPUT PARAMETERS (read from the argument list through ap)
+# res_ptr 4(ap)
+# s1_ptr 8(ap)
+# size 12(ap)
+# s2_limb 16(ap)
+
+.text
+ .align 1
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ .word 0xfc0 # entry mask: save r6-r11
+ movl 12(ap),r4 # r4 = size
+ movl 8(ap),r8 # r8 = s1_ptr
+ movl 4(ap),r9 # r9 = res_ptr
+ movl 16(ap),r6 # r6 = s2_limb; this movl sets N for the jlss below
+ jlss s2_big # s2_limb has its top bit set: use the second loop
+
+# One might want to combine the addl2 and the store below, but that
+# is actually just slower according to my timing tests. (VAX 3600)
+
+ clrl r3 # zero carry limb for an entry at L1
+ incl r4
+ ashl $-1,r4,r7 # r7 = (size + 1) / 2 = loop passes, two limbs per pass
+ jlbc r4,L1 # size odd: enter mid-loop and do a single limb first
+ clrl r11 # zero carry limb for an entry at Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1 # r1 = next s1 limb
+ jlss L1n0 # limb has top bit set: take the correcting path
+ emul r1,r6,$0,r2 # r3:r2 = r1 * r6 (signed multiply)
+ addl2 r11,r2 # add carry limb from the previous step
+ adwc $0,r3 # r3 = carry limb for the next step
+ movl r2,(r9)+ # store result limb
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10 # r11:r10 = r1 * r6
+ addl2 r3,r10
+ adwc $0,r11 # r11 = carry limb for the next step
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0 # return the final carry limb
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3 # s1 limb was negative as signed: add s2_limb into the high word
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L1p1 # next limb non-negative: rejoin the main path
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11 # s1 limb was negative as signed: add s2_limb into the high word
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0 # return the final carry limb
+ ret
+
+
+s2_big: clrl r3 # same setup as above, for the s2_limb >= 0x80000000 loop
+ incl r4
+ ashl $-1,r4,r7 # r7 = (size + 1) / 2 loop passes
+ jlbc r4,L2 # size odd: enter mid-loop
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3 # s2_limb is negative as signed: add the s1 limb into the high word
+ movl r2,(r9)+
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11 # s2_limb is negative as signed: add the s1 limb into the high word
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0 # return the final carry limb
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r1,r3 # both operands negative as signed: add the s1 limb ...
+ addl2 r11,r2
+ adwc r6,r3 # ... and s2_limb (plus carry) into the high word
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r1,r11 # both operands negative as signed: add the s1 limb ...
+ addl2 r3,r10
+ adwc r6,r11 # ... and s2_limb (plus carry) into the high word
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0 # return the final carry limb
+ ret
diff --git a/gmp/mpn/vax/rshift.asm b/gmp/mpn/vax/rshift.asm
deleted file mode 100644
index 00b2daac01..0000000000
--- a/gmp/mpn/vax/rshift.asm
+++ /dev/null
@@ -1,57 +0,0 @@
-dnl VAX mpn_rshift -- right shift.
-
-dnl Copyright 1999-2001, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(mpn_rshift)
- .word 0x1c0
- movl 4(ap), r7
- movl 8(ap), r6
- movl 12(ap), r1
- movl 16(ap), r8
-
- movl (r6)+, r2
- subl3 r8, $32, r8
- ashl r8, r2, r0
- decl r1
- jeql L(end)
-
-L(top): movl (r6)+, r3
- ashq r8, r2, r4
- movl r5, (r7)+
- movl r3, r2
- sobgtr r1, L(top)
-
-L(end): clrl r3
- ashq r8, r2, r4
- movl r5, (r7)
- ret
-EPILOGUE()
diff --git a/gmp/mpn/vax/rshift.s b/gmp/mpn/vax/rshift.s
new file mode 100644
index 0000000000..ae27208e2c
--- /dev/null
+++ b/gmp/mpn/vax/rshift.s
@@ -0,0 +1,54 @@
+# VAX mpn_rshift -- right shift.
+
+# Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+# INPUT PARAMETERS (argument-list longwords, read through ap by the code below)
+# rptr 4(ap)
+# sptr 8(ap)
+# size 12(ap)
+# cnt 16(ap)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_rshift
+___gmpn_rshift:
+ .word 0x1c0 # procedure entry mask: bits 6-8 set, i.e. r6-r8
+ movl 4(ap),r7 # r7 = rptr
+ movl 8(ap),r6 # r6 = sptr
+ movl 12(ap),r1 # r1 = size
+ movl 16(ap),r8 # r8 = cnt
+
+ movl (r6)+,r2 # r2 = lowest source limb
+ subl3 r8,$32,r8 # r8 = 32 - cnt, used below as a left-shift count
+ ashl r8,r2,r0 # r0 = low limb << (32 - cnt); r0 is not touched again
+ decl r1 # one limb is already loaded
+ jeql Lend # size was 1: skip the loop
+
+Loop: movl (r6)+,r3 # r3 = next higher limb; r3:r2 forms a quadword
+ ashq r8,r2,r4 # r5:r4 = r3:r2 << (32 - cnt)
+ movl r5,(r7)+ # store the high half as the next result limb
+ movl r3,r2 # the upper limb becomes the lower limb of the next pair
+ sobgtr r1,Loop # r1 -= 1; loop while r1 > 0
+
+Lend: clrl r3 # nothing above the top limb: pair it with zero
+ ashq r8,r2,r4 # r5:r4 = 0:r2 << (32 - cnt)
+ movl r5,(r7) # store the top result limb
+ ret
diff --git a/gmp/mpn/vax/sub_n.asm b/gmp/mpn/vax/sub_n.asm
deleted file mode 100644
index 2844ef2cc1..0000000000
--- a/gmp/mpn/vax/sub_n.asm
+++ /dev/null
@@ -1,64 +0,0 @@
-dnl VAX mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
-dnl store difference in a third limb vector.
-
-dnl Copyright 1999, 2000, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(mpn_sub_n)
- .word 0x0
- movl 16(ap), r0
- movl 12(ap), r1
- movl 8(ap), r2
- movl 4(ap), r3
- mnegl r0, r5
- addl2 $3, r0
- ashl $-2, r0, r0 C unroll loop count
- bicl2 $-4, r5 C mask out low 2 bits
- movaq (r5)[r5], r5 C 9x
- jmp L(top)[r5]
-
-L(top): movl (r2)+, r4
- sbwc (r1)+, r4
- movl r4, (r3)+
- movl (r2)+, r4
- sbwc (r1)+, r4
- movl r4, (r3)+
- movl (r2)+, r4
- sbwc (r1)+, r4
- movl r4, (r3)+
- movl (r2)+, r4
- sbwc (r1)+, r4
- movl r4, (r3)+
- sobgtr r0, L(top)
-
- adwc r0, r0
- ret
-EPILOGUE()
diff --git a/gmp/mpn/vax/sub_n.s b/gmp/mpn/vax/sub_n.s
new file mode 100644
index 0000000000..c9ad1ecfb8
--- /dev/null
+++ b/gmp/mpn/vax/sub_n.s
@@ -0,0 +1,59 @@
+# VAX __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+# difference in a third limb vector.
+
+# Copyright 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+# INPUT PARAMETERS (argument-list longwords, read through ap by the code below)
+# res_ptr 4(ap)
+# s1_ptr 8(ap)
+# s2_ptr 12(ap)
+# size 16(ap)
+
+.text
+ .align 1
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ .word 0x0 # procedure entry mask: no bits set
+ movl 16(ap),r0 # r0 = size
+ movl 12(ap),r1 # r1 = s2_ptr (subtrahend)
+ movl 8(ap),r2 # r2 = s1_ptr (minuend)
+ movl 4(ap),r3 # r3 = res_ptr
+ mnegl r0,r5 # r5 = -size
+ addl2 $3,r0 # round size up to a multiple of 4 ...
+ ashl $-2,r0,r0 # ... r0 = (size + 3) >> 2 = trip count of the 4x unrolled loop
+ bicl2 $-4,r5 # keep only the low 2 bits: r5 = (-size) & 3 = steps to skip
+ movaq (r5)[r5],r5 # r5 = r5 + 8*r5 = 9*r5; presumably 9 bytes per 3-instruction step
+ jmp Loop(r5) # enter the unrolled loop r5 bytes past Loop
+
+Loop: movl (r2)+,r4 # r4 = next s1 limb
+ sbwc (r1)+,r4 # r4 -= next s2 limb + borrow (C)
+ movl r4,(r3)+ # store difference limb
+ movl (r2)+,r4 # step 2, same pattern
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4 # step 3
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4 # step 4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop # r0 -= 1; loop while r0 > 0
+
+ adwc r0,r0 # r0 = r0 + r0 + C; with r0 = 0 after the loop this is the final borrow
+ ret
diff --git a/gmp/mpn/vax/submul_1.asm b/gmp/mpn/vax/submul_1.asm
deleted file mode 100644
index 60d47fcd6f..0000000000
--- a/gmp/mpn/vax/submul_1.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-dnl VAX mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
-dnl result from a second limb vector.
-
-dnl Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ASM_START()
-PROLOGUE(mpn_submul_1)
- .word 0xfc0
- movl 12(ap), r4
- movl 8(ap), r8
- movl 4(ap), r9
- clrl r3
- incl r4
- ashl $-1, r4, r7
- clrl r11
- movl 16(ap), r6
- jlss L(v0_big)
- jlbc r4, L(1)
-
-C Loop for v0 < 0x80000000
-L(tp1): movl (r8)+, r1
- jlss L(1n0)
- emul r1, r6, $0, r2
- addl2 r11, r2
- adwc $0, r3
- subl2 r2, (r9)+
- adwc $0, r3
-L(1): movl (r8)+, r1
- jlss L(1n1)
-L(1p1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc $0, r11
- subl2 r10, (r9)+
- adwc $0, r11
-
- sobgtr r7, L(tp1)
- movl r11, r0
- ret
-
-L(1n0): emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r6, r3
- subl2 r2, (r9)+
- adwc $0, r3
- movl (r8)+, r1
- jgeq L(1p1)
-L(1n1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r6, r11
- subl2 r10, (r9)+
- adwc $0, r11
-
- sobgtr r7, L(tp1)
- movl r11, r0
- ret
-
-L(v0_big):
- jlbc r4, L(2)
-
-C Loop for v0 >= 0x80000000
-L(tp2): movl (r8)+, r1
- jlss L(2n0)
- emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r1, r3
- subl2 r2, (r9)+
- adwc $0, r3
-L(2): movl (r8)+, r1
- jlss L(2n1)
-L(2p1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r1, r11
- subl2 r10, (r9)+
- adwc $0, r11
-
- sobgtr r7, L(tp2)
- movl r11, r0
- ret
-
-L(2n0): emul r1, r6, $0, r2
- addl2 r11, r2
- adwc r6, r3
- subl2 r2, (r9)+
- adwc r1, r3
- movl (r8)+, r1
- jgeq L(2p1)
-L(2n1): emul r1, r6, $0, r10
- addl2 r3, r10
- adwc r6, r11
- subl2 r10, (r9)+
- adwc r1, r11
-
- sobgtr r7, L(tp2)
- movl r11, r0
- ret
-EPILOGUE()
diff --git a/gmp/mpn/vax/submul_1.s b/gmp/mpn/vax/submul_1.s
new file mode 100644
index 0000000000..ad0ddbbacb
--- /dev/null
+++ b/gmp/mpn/vax/submul_1.s
@@ -0,0 +1,124 @@
+# VAX __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+# INPUT PARAMETERS (argument-list longwords, read through ap by the code below)
+# res_ptr 4(ap)
+# s1_ptr 8(ap)
+# size 12(ap)
+# s2_limb 16(ap)
+
+.text
+ .align 1
+.globl ___gmpn_submul_1
+___gmpn_submul_1:
+ .word 0xfc0 # procedure entry mask: bits 6-11 set, i.e. r6-r11
+ movl 12(ap),r4 # r4 = size
+ movl 8(ap),r8 # r8 = s1_ptr, read with post-increment
+ movl 4(ap),r9 # r9 = res_ptr, updated in place with post-increment
+ movl 16(ap),r6 # r6 = s2_limb; this movl also sets N for the jlss below
+ jlss s2_big # s2_limb has its top bit set: take the Loop2 variant
+
+ clrl r3 # r3 = 0: carry-in limb when entering at L1
+ incl r4 # r4 = size + 1
+ ashl $-1,r4,r7 # r7 = (size + 1) >> 1 = trip count of the 2-limb loop
+ jlbc r4,L1 # size + 1 even (size odd): enter at the second half
+ clrl r11 # r11 = 0: carry-in limb when entering at Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1 # r1 = next s1 limb
+ jlss L1n0 # top bit set: emul sees it as negative, needs fix-up
+ emul r1,r6,$0,r2 # r3:r2 = r1 * r6 + 0 (signed 32x32 -> 64)
+ addl2 r11,r2 # add the high limb carried from the previous product
+ adwc $0,r3 # propagate that carry into the new high limb
+ subl2 r2,(r9)+ # *res_ptr++ -= low limb; C = borrow
+ adwc $0,r3 # fold the borrow into the limb carried forward
+L1: movl (r8)+,r1 # r1 = next s1 limb
+ jlss L1n1 # top bit set: use the fix-up variant
+L1p1: emul r1,r6,$0,r10 # r11:r10 = r1 * r6 + 0
+ addl2 r3,r10 # add the limb carried from the previous step
+ adwc $0,r11
+ subl2 r10,(r9)+ # *res_ptr++ -= low limb; C = borrow
+ adwc $0,r11 # fold the borrow into the limb carried forward
+
+ sobgtr r7,Loop1 # r7 -= 1; loop while r7 > 0
+ movl r11,r0 # return value = final carried limb
+ ret
+
+L1n0: emul r1,r6,$0,r2 # as at Loop1, but r1 was treated as r1 - 2^32 ...
+ addl2 r11,r2
+ adwc r6,r3 # ... so add s2_limb (plus carry) back into the high limb
+ subl2 r2,(r9)+
+ adwc $0,r3 # fold the borrow into the limb carried forward
+ movl (r8)+,r1 # second limb of this iteration
+ jgeq L1p1 # top bit clear: rejoin the no-fix-up path
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11 # fix-up for a limb with its top bit set, as in L1n0
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0 # return value = final carried limb
+ ret
+
+
+s2_big: clrl r3 # same set-up as above, for the Loop2 variant
+ incl r4
+ ashl $-1,r4,r7 # r7 = (size + 1) >> 1
+ jlbc r4,L2 # size odd: enter at the second half
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1 # r1 = next s1 limb
+ jlss L2n0 # both operands have the top bit set: double fix-up
+ emul r1,r6,$0,r2 # r3:r2 = r1 * r6, with r6 treated as r6 - 2^32 ...
+ addl2 r11,r2
+ adwc r1,r3 # ... so add the s1 limb (plus carry) into the high limb
+ subl2 r2,(r9)+ # *res_ptr++ -= low limb; C = borrow
+ adwc $0,r3 # fold the borrow into the limb carried forward
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11 # fix-up for s2_limb's top bit
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0 # return value = final carried limb
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2 # add the carried limb; sets C for the adwc
+ adwc r6,r3 # fix-up for the s1 limb's top bit: add s2_limb + carry
+ subl2 r2,(r9)+ # *res_ptr++ -= low limb; C = borrow
+ adwc r1,r3 # fix-up for s2_limb's top bit: add the s1 limb + borrow
+ movl (r8)+,r1 # second limb of this iteration
+ jgeq L2p1 # top bit clear: only the s2_limb fix-up is needed
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11 # add s2_limb + carry
+ subl2 r10,(r9)+
+ adwc r1,r11 # add the s1 limb + borrow
+
+ sobgtr r7,Loop2
+ movl r11,r0 # return value = final carried limb
+ ret
diff --git a/gmp/mpn/x86/README b/gmp/mpn/x86/README
index 8d7ac9080d..883db227d2 100644
--- a/gmp/mpn/x86/README
+++ b/gmp/mpn/x86/README
@@ -1,30 +1,19 @@
-Copyright 1999-2002 Free Software Foundation, Inc.
+Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86/aors_n.asm b/gmp/mpn/x86/aors_n.asm
index 5d359f59b6..c8969995c8 100644
--- a/gmp/mpn/x86/aors_n.asm
+++ b/gmp/mpn/x86/aors_n.asm
@@ -1,42 +1,32 @@
dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl Copyright 1992, 1994-1996, 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2001, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C P5 3.375
-C P6 3.125
-C K6 3.5
-C K7 2.25
-C P4 8.75
+C P5: 3.375
+C P6: 3.125
+C K6: 3.5
+C K7: 2.25
+C P4: 8.75
ifdef(`OPERATION_add_n',`
@@ -109,7 +99,7 @@ L(0a): leal (%eax,%eax,8),%eax
C possible to simplify.
pushl %ebp FRAME_pushl()
movl PARAM_CARRY,%ebp
- shrl %ebp C shift bit 0 into carry
+ shrl $1,%ebp C shift bit 0 into carry
popl %ebp FRAME_popl()
jmp *%eax C jump into loop
@@ -158,7 +148,7 @@ L(0b): leal (%eax,%eax,8),%eax
L(oopgo):
pushl %ebp FRAME_pushl()
movl PARAM_CARRY,%ebp
- shrl %ebp C shift bit 0 into carry
+ shrl $1,%ebp C shift bit 0 into carry
popl %ebp FRAME_popl()
ALIGN(16)
diff --git a/gmp/mpn/x86/aorsmul_1.asm b/gmp/mpn/x86/aorsmul_1.asm
index 54a8905441..b4db427657 100644
--- a/gmp/mpn/x86/aorsmul_1.asm
+++ b/gmp/mpn/x86/aorsmul_1.asm
@@ -1,51 +1,40 @@
dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
dnl limb and add the result to a second limb vector.
-dnl Copyright 1992, 1994, 1997, 1999-2002, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 1992, 1994, 1997, 1999, 2000, 2001, 2002, 2005 Free Software
+dnl Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5 14.75
-C P6 model 0-8,10-12 7.5
-C P6 model 9 (Banias) 6.7
-C P6 model 13 (Dothan) 6.75
-C P4 model 0 (Willamette) 24.0
-C P4 model 1 (?) 24.0
-C P4 model 2 (Northwood) 24.0
+
+C cycles/limb
+C P5: 14.75
+C P6 model 0-8,10-12) 7.5
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 6.75
+C P4 model 0 (Willamette) 24.0
+C P4 model 1 (?) 24.0
+C P4 model 2 (Northwood) 24.0
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C Intel Atom
-C AMD K6 12.5
-C AMD K7 5.25
-C AMD K8
-C AMD K10
+C K6: 12.5
+C K7: 5.25
+C K8:
ifdef(`OPERATION_addmul_1',`
diff --git a/gmp/mpn/x86/atom/aorrlsh1_n.asm b/gmp/mpn/x86/atom/aorrlsh1_n.asm
deleted file mode 100644
index cd1a650022..0000000000
--- a/gmp/mpn/x86/atom/aorrlsh1_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl Intel Atom mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-
-dnl Contributed to the GNU project by Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-define(RSH, 31)
-
-ifdef(`OPERATION_addlsh1_n', `
- define(M4_inst, adc)
- define(M4_opp, sub)
- define(M4_function, mpn_addlsh1_n)
- define(M4_function_c, mpn_addlsh1_nc)
-',`ifdef(`OPERATION_rsblsh1_n', `
- define(M4_inst, sbb)
- define(M4_opp, add)
- define(M4_function, mpn_rsblsh1_n)
- define(M4_function_c, mpn_rsblsh1_nc)
-',`m4_error(`Need OPERATION_addlsh1_n or OPERATION_rsblsh1_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
-
-include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86/atom/aorrlsh2_n.asm b/gmp/mpn/x86/atom/aorrlsh2_n.asm
deleted file mode 100644
index 10f4419de9..0000000000
--- a/gmp/mpn/x86/atom/aorrlsh2_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl Intel Atom mpn_addlsh2_n/mpn_rsblsh2_n -- rp[] = (vp[] << 2) +- up[]
-
-dnl Contributed to the GNU project by Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 30)
-
-ifdef(`OPERATION_addlsh2_n', `
- define(M4_inst, adcl)
- define(M4_opp, subl)
- define(M4_function, mpn_addlsh2_n)
- define(M4_function_c, mpn_addlsh2_nc)
-',`ifdef(`OPERATION_rsblsh2_n', `
- define(M4_inst, sbbl)
- define(M4_opp, addl)
- define(M4_function, mpn_rsblsh2_n)
- define(M4_function_c, mpn_rsblsh2_nc)
-',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_rsblsh2_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n mpn_rsblsh2_nc)
-
-include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86/atom/aorrlshC_n.asm b/gmp/mpn/x86/atom/aorrlshC_n.asm
deleted file mode 100644
index 71cfe490d6..0000000000
--- a/gmp/mpn/x86/atom/aorrlshC_n.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-dnl Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[]
-
-dnl Contributed to the GNU project by Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t carry);
-C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_signed_limb_t carry);
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 6
-C AMD K6
-C AMD K7
-C AMD K8
-C AMD K10
-
-defframe(PARAM_CORB, 20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_DBLD, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(VAR_COUNT,`PARAM_SIZE')
-define(SAVE_EBP,`PARAM_DBLD')
-define(SAVE_VP,`PARAM_SRC')
-define(SAVE_UP,`PARAM_DST')
-
-define(M, eval(m4_lshift(1,LSH)))
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebx')
-
-ASM_START()
- TEXT
- ALIGN(8)
-
-PROLOGUE(M4_function_c)
-deflit(`FRAME',0)
- movl PARAM_CORB, %eax
- movl %eax, %edx
- shr $LSH, %edx
- andl $1, %edx
- M4_opp %edx, %eax
- jmp L(start_nc)
-EPILOGUE()
-
-PROLOGUE(M4_function)
-deflit(`FRAME',0)
-
- xor %eax, %eax
- xor %edx, %edx
-L(start_nc):
- push rp FRAME_pushl()
-
- mov PARAM_SIZE, %ecx C size
- mov PARAM_DST, rp
- mov up, SAVE_UP
- incl %ecx C size + 1
- mov PARAM_SRC, up
- mov vp, SAVE_VP
- shr %ecx C (size+1)\2
- mov PARAM_DBLD, vp
- mov %ebp, SAVE_EBP
- mov %ecx, VAR_COUNT
- jnc L(entry) C size odd
-
- shr %edx C size even
- mov (vp), %ecx
- lea 4(vp), vp
- lea (%eax,%ecx,M), %edx
- mov %ecx, %eax
- lea -4(up), up
- lea -4(rp), rp
- jmp L(enteven)
-
- ALIGN(16)
-L(oop):
- lea (%eax,%ecx,M), %ebp
- shr $RSH, %ecx
- mov 4(vp), %eax
- shr %edx
- lea 8(vp), vp
- M4_inst (up), %ebp
- lea (%ecx,%eax,M), %edx
- mov %ebp, (rp)
-L(enteven):
- M4_inst 4(up), %edx
- lea 8(up), up
- mov %edx, 4(rp)
- adc %edx, %edx
- shr $RSH, %eax
- lea 8(rp), rp
-L(entry):
- mov (vp), %ecx
- decl VAR_COUNT
- jnz L(oop)
-
- lea (%eax,%ecx,M), %ebp
- shr $RSH, %ecx
- shr %edx
- mov SAVE_VP, vp
- M4_inst (up), %ebp
- mov %ecx, %eax
- mov SAVE_UP, up
- M4_inst $0, %eax
- mov %ebp, (rp)
- mov SAVE_EBP, %ebp
- pop rp FRAME_popl()
- ret
-EPILOGUE()
-
-ASM_END()
diff --git a/gmp/mpn/x86/atom/aors_n.asm b/gmp/mpn/x86/atom/aors_n.asm
deleted file mode 100644
index 45ec287c3a..0000000000
--- a/gmp/mpn/x86/atom/aors_n.asm
+++ /dev/null
@@ -1,159 +0,0 @@
-dnl Intel Atom mpn_add_n/mpn_sub_n -- rp[] = up[] +- vp[].
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 3
-C AMD K6
-C AMD K7
-C AMD K8
-C AMD K10
-
-ifdef(`OPERATION_add_n', `
- define(M4_inst, adcl)
- define(M4_function_n, mpn_add_n)
- define(M4_function_nc, mpn_add_nc)
- define(M4_description, add)
-',`ifdef(`OPERATION_sub_n', `
- define(M4_inst, sbbl)
- define(M4_function_n, mpn_sub_n)
- define(M4_function_nc, mpn_sub_nc)
- define(M4_description, subtract)
-',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t carry);
-C
-C Calculate src1,size M4_description src2,size, and store the result in
-C dst,size. The return value is the carry bit from the top of the result (1
-C or 0).
-C
-C The _nc version accepts 1 or 0 for an initial carry into the low limb of
-C the calculation. Note values other than 1 or 0 here will lead to garbage
-C results.
-
-defframe(PARAM_CARRY,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC2, 12)
-defframe(PARAM_SRC1, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(SAVE_RP,`PARAM_SIZE')
-define(SAVE_VP,`PARAM_SRC1')
-define(SAVE_UP,`PARAM_DST')
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebx')
-define(`cy', `%ecx')
-define(`r1', `%ecx')
-define(`r2', `%edx')
-
-ASM_START()
- TEXT
- ALIGN(16)
-deflit(`FRAME',0)
-
-PROLOGUE(M4_function_n)
- xor cy, cy C carry
-L(start):
- mov PARAM_SIZE, %eax C size
- mov rp, SAVE_RP
- mov PARAM_DST, rp
- mov up, SAVE_UP
- mov PARAM_SRC1, up
- shr %eax C size >> 1
- mov vp, SAVE_VP
- mov PARAM_SRC2, vp
- jz L(one) C size == 1
- jc L(three) C size % 2 == 1
-
- shr cy
- mov (up), r2
- lea 4(up), up
- lea 4(vp), vp
- lea -4(rp), rp
- jmp L(entry)
-L(one):
- shr cy
- mov (up), r1
- jmp L(end)
-L(three):
- shr cy
- mov (up), r1
-
- ALIGN(16)
-L(oop):
- M4_inst (vp), r1
- lea 8(up), up
- mov -4(up), r2
- lea 8(vp), vp
- mov r1, (rp)
-L(entry):
- M4_inst -4(vp), r2
- lea 8(rp), rp
- dec %eax
- mov (up), r1
- mov r2, -4(rp)
- jnz L(oop)
-
-L(end): C %eax is zero here
- mov SAVE_UP, up
- M4_inst (vp), r1
- mov SAVE_VP, vp
- mov r1, (rp)
- adc %eax, %eax
- mov SAVE_RP, rp
- ret
-EPILOGUE()
-
-PROLOGUE(M4_function_nc)
- mov PARAM_CARRY, cy C carry
- jmp L(start)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/aorslshC_n.asm b/gmp/mpn/x86/atom/aorslshC_n.asm
deleted file mode 100644
index 75ace65e51..0000000000
--- a/gmp/mpn/x86/atom/aorslshC_n.asm
+++ /dev/null
@@ -1,247 +0,0 @@
-dnl Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)
-
-dnl Contributed to the GNU project by Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t carry);
-C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_signed_limb_t borrow);
-
-defframe(PARAM_CORB, 16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t carry);
-C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t borrow);
-
-C if src1 == dst, _ip1 is used
-
-C cycles/limb
-C dst!=src1,src2 dst==src1
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 7 6
-C AMD K6
-C AMD K7
-C AMD K8
-C AMD K10
-
-defframe(GPARAM_CORB, 20)
-defframe(GPARAM_SIZE, 16)
-defframe(GPARAM_SRC2, 12)
-
-dnl re-use parameter space
-define(SAVE_EBP,`PARAM_SIZE')
-define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_UP,`PARAM_DST')
-
-define(M, eval(m4_lshift(1,LSH)))
-define(`rp', `%edi')
-define(`up', `%esi')
-
-ASM_START()
- TEXT
- ALIGN(8)
-
-PROLOGUE(M4_ip_function_c)
-deflit(`FRAME',0)
- movl PARAM_CORB, %ecx C carry or borrow argument
- movl %ecx, %edx
- shr $LSH, %edx
- andl $1, %edx C edx = bit LSH of the argument
- M4_opp %edx, %ecx
- jmp L(start_nc) C join M4_ip_function after its register clearing
-EPILOGUE()
-
-PROLOGUE(M4_ip_function)
-deflit(`FRAME',0)
-
- xor %ecx, %ecx C no carry or borrow input
- xor %edx, %edx
-L(start_nc):
- push rp FRAME_pushl()
- mov PARAM_DST, rp
- mov up, SAVE_UP C save up in the re-used dst slot
- mov PARAM_SRC, up
- mov %ebx, SAVE_EBX C save ebx in the re-used src slot
- mov PARAM_SIZE, %ebx C size
-L(inplace):
- incl %ebx C size + 1
- shr %ebx C (size+1)\2
- mov %ebp, SAVE_EBP C save ebp in the re-used size slot
- jnc L(entry) C size odd
-
- add %edx, %edx C size even. NOTE(review): edx is 0 or 1 on entry so CF ends up clear - confirm intended
- mov %ecx, %ebp
- mov (up), %ecx
- lea -4(rp), rp
- lea (%ebp,%ecx,M), %eax
- lea 4(up), up
- jmp L(enteven)
-
- ALIGN(16)
-L(oop): C two limbs per iteration
- lea (%ecx,%eax,M), %ebp C ebp = ecx + eax*M
- shr $RSH, %eax
- mov 4(up), %ecx
- add %edx, %edx C CF = top bit of edx
- lea 8(up), up
- M4_inst %ebp, (rp)
- lea (%eax,%ecx,M), %eax C eax = eax + ecx*M
-
-L(enteven):
- M4_inst %eax, 4(rp)
- lea 8(rp), rp
-
- sbb %edx, %edx C edx = -CF
- shr $RSH, %ecx
-
-L(entry):
- mov (up), %eax
- decl %ebx
- jnz L(oop)
-
- lea (%ecx,%eax,M), %ebp
- shr $RSH, %eax
- shr %edx C CF = bit 0 of edx
- M4_inst %ebp, (rp) C last limb
- mov SAVE_UP, up
- adc $0, %eax C return value = eax + CF
- mov SAVE_EBP, %ebp
- mov SAVE_EBX, %ebx
- pop rp FRAME_popl()
- ret
-EPILOGUE()
-
-PROLOGUE(M4_function_c)
-deflit(`FRAME',0)
- movl GPARAM_CORB, %ecx C carry or borrow argument
- movl %ecx, %edx
- shr $LSH, %edx
- andl $1, %edx C edx = bit LSH of the argument
- M4_opp %edx, %ecx
- jmp L(generic_nc) C join M4_function after its register clearing
-EPILOGUE()
-
-PROLOGUE(M4_function)
-deflit(`FRAME',0)
-
- xor %ecx, %ecx C no carry or borrow input
- xor %edx, %edx
-L(generic_nc):
- push rp FRAME_pushl()
- mov PARAM_DST, rp
- mov up, SAVE_UP C save up in the re-used dst slot
- mov PARAM_SRC, up C src1
- cmp rp, up C dst == src1 ?
- mov %ebx, SAVE_EBX C mov keeps the cmp flags intact
- jne L(general)
- mov GPARAM_SIZE, %ebx C size
- mov GPARAM_SRC2, up
- jmp L(inplace) C dst == src1, use the in-place code
-
-L(general):
- mov GPARAM_SIZE, %eax C size
- mov %ebx, SAVE_EBX C repeats the save done before the branch
- incl %eax C size + 1
- mov up, %ebx C vp
- mov GPARAM_SRC2, up C up
- shr %eax C (size+1)\2
- mov %ebp, SAVE_EBP
- mov %eax, GPARAM_SIZE C loop counter kept in the size slot
- jnc L(entry2) C size odd
-
- add %edx, %edx C size even
- mov %ecx, %ebp
- mov (up), %ecx
- lea -4(rp), rp
- lea -4(%ebx), %ebx
- lea (%ebp,%ecx,M), %eax
- lea 4(up), up
- jmp L(enteven2)
-
- ALIGN(16)
-L(oop2): C two limbs per iteration
- lea (%ecx,%eax,M), %ebp C ebp = ecx + eax*M
- shr $RSH, %eax
- mov 4(up), %ecx
- add %edx, %edx C CF = top bit of edx
- lea 8(up), up
- mov (%ebx), %edx C edx re-used as limb temporary, mov keeps CF
- M4_inst %ebp, %edx
- lea (%eax,%ecx,M), %eax C eax = eax + ecx*M
- mov %edx, (rp)
-L(enteven2):
- mov 4(%ebx), %edx
- lea 8(%ebx), %ebx
- M4_inst %eax, %edx
- mov %edx, 4(rp)
- sbb %edx, %edx C edx = -CF
- shr $RSH, %ecx
- lea 8(rp), rp
-L(entry2):
- mov (up), %eax
- decl GPARAM_SIZE
- jnz L(oop2)
-
- lea (%ecx,%eax,M), %ebp
- shr $RSH, %eax
- shr %edx C CF = bit 0 of edx
- mov (%ebx), %edx
- M4_inst %ebp, %edx C last limb
- mov %edx, (rp)
- mov SAVE_UP, up
- adc $0, %eax C return value = eax + CF
- mov SAVE_EBP, %ebp
- mov SAVE_EBX, %ebx
- pop rp FRAME_popl()
- ret
-EPILOGUE()
-
-ASM_END()
diff --git a/gmp/mpn/x86/atom/bdiv_q_1.asm b/gmp/mpn/x86/atom/bdiv_q_1.asm
deleted file mode 100644
index 31e908ec44..0000000000
--- a/gmp/mpn/x86/atom/bdiv_q_1.asm
+++ /dev/null
@@ -1,35 +0,0 @@
-dnl Intel Atom mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel
-dnl division by 1-limb divisor, returning quotient only.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
-include_mpn(`x86/pentium/bdiv_q_1.asm')
diff --git a/gmp/mpn/x86/atom/cnd_add_n.asm b/gmp/mpn/x86/atom/cnd_add_n.asm
deleted file mode 100644
index 50bf2ad64b..0000000000
--- a/gmp/mpn/x86/atom/cnd_add_n.asm
+++ /dev/null
@@ -1,113 +0,0 @@
-dnl X86 mpn_cnd_add_n optimised for Intel Atom.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) ?
-C P4 model 3-4 (Prescott) ?
-C Intel atom 4.67
-C AMD K6 ?
-C AMD K7 ?
-C AMD K8 ?
-
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebp')
-define(`n', `%ecx')
-define(`cnd', `20(%esp)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_cnd_add_n)
- push %edi
- push %esi
- push %ebx
- push %ebp C four pushes, first argument is now at 20(%esp)
-
- mov cnd, %eax C make cnd into a mask: load cnd
- mov 24(%esp), rp
- neg %eax C make cnd into a mask: CF = (cnd != 0)
- mov 28(%esp), up
- sbb %eax, %eax C make cnd into a mask: eax = -CF
- mov 32(%esp), vp
- mov %eax, cnd C make cnd into a mask: store it over cnd
- mov 36(%esp), n
-
- xor %edx, %edx
-
- shr $1, n C n >> 1, CF = low bit of n
- jnc L(top) C n even
-
- mov 0(vp), %eax C n odd, handle one limb first
- and cnd, %eax
- lea 4(vp), vp
- add 0(up), %eax
- lea 4(rp), rp
- lea 4(up), up
- sbb %edx, %edx C edx = -CF
- mov %eax, -4(rp)
- inc n C inc then dec sets ZF from n, CF untouched
- dec n
- je L(end)
-
-L(top): sbb %edx, %edx C edx = -CF, the and below clears CF
- mov 0(vp), %eax
- and cnd, %eax
- lea 8(vp), vp
- lea 8(rp), rp
- mov -4(vp), %ebx
- and cnd, %ebx
- add %edx, %edx C CF = top bit of edx
- adc 0(up), %eax
- lea 8(up), up
- mov %eax, -8(rp)
- adc -4(up), %ebx
- dec n C dec keeps CF intact
- mov %ebx, -4(rp)
- jne L(top)
-
-L(end): mov $0, %eax C mov keeps CF intact
- adc %eax, %eax C return value = CF
-
- pop %ebp
- pop %ebx
- pop %esi
- pop %edi
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/cnd_sub_n.asm b/gmp/mpn/x86/atom/cnd_sub_n.asm
deleted file mode 100644
index 221bedca37..0000000000
--- a/gmp/mpn/x86/atom/cnd_sub_n.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-dnl X86 mpn_cnd_sub_n optimised for Intel Atom.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) ?
-C P4 model 3-4 (Prescott) ?
-C Intel atom 5.67
-C AMD K6 ?
-C AMD K7 ?
-C AMD K8 ?
-
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebp')
-define(`n', `%ecx')
-define(`cnd', `20(%esp)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_cnd_sub_n)
- push %edi
- push %esi
- push %ebx
- push %ebp C four pushes, first argument is now at 20(%esp)
-
- mov cnd, %eax C make cnd into a mask: load cnd
- mov 24(%esp), rp
- neg %eax C make cnd into a mask: CF = (cnd != 0)
- mov 28(%esp), up
- sbb %eax, %eax C make cnd into a mask: eax = -CF
- mov 32(%esp), vp
- mov %eax, cnd C make cnd into a mask: store it over cnd
- mov 36(%esp), n
-
- xor %edx, %edx
-
- inc n
- shr n C n = (n+1) >> 1, CF = low bit of n+1
- jnc L(ent) C n was odd
-
- mov 0(vp), %eax C n was even, handle one limb first
- and cnd, %eax
- lea 4(vp), vp
- mov 0(up), %edx
- sub %eax, %edx
- lea 4(rp), rp
- lea 4(up), up
- mov %edx, -4(rp)
- sbb %edx, %edx C save cy
-
-L(ent): mov 0(vp), %ebx
- and cnd, %ebx
- add %edx, %edx C restore cy
- mov 0(up), %edx
- dec n C dec keeps CF intact
- je L(end)
-
-L(top): sbb %ebx, %edx
- mov 4(vp), %eax
- mov %edx, 0(rp)
- sbb %edx, %edx C save cy
- mov 8(vp), %ebx
- lea 8(up), up
- and cnd, %ebx
- and cnd, %eax
- add %edx, %edx C restore cy
- mov -4(up), %edx
- lea 8(rp), rp
- sbb %eax, %edx
- mov %edx, -4(rp)
- dec n C dec keeps CF intact
- mov 0(up), %edx
- lea 8(vp), vp
- jne L(top)
-
-L(end): sbb %ebx, %edx C last limb
- mov %edx, 0(rp)
-
- mov $0, %eax C mov keeps CF intact
- adc %eax, %eax C return value = CF
-
- pop %ebp
- pop %ebx
- pop %esi
- pop %edi
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/dive_1.asm b/gmp/mpn/x86/atom/dive_1.asm
deleted file mode 100644
index 71036a15a4..0000000000
--- a/gmp/mpn/x86/atom/dive_1.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_divexact_1 -- mpn by limb exact division.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_divexact_1)
-include_mpn(`x86/pentium/dive_1.asm')
diff --git a/gmp/mpn/x86/atom/gmp-mparam.h b/gmp/mpn/x86/atom/gmp-mparam.h
deleted file mode 100644
index 45df12806c..0000000000
--- a/gmp/mpn/x86/atom/gmp-mparam.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Intel Atom/32 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 1667 MHz Pineview (Atom D510) */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-14, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 5
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 4
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 31
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 74
-#define MUL_TOOM44_THRESHOLD 178
-#define MUL_TOOM6H_THRESHOLD 270
-#define MUL_TOOM8H_THRESHOLD 399
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 127
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 105
-#define SQR_TOOM4_THRESHOLD 178
-#define SQR_TOOM6_THRESHOLD 303
-#define SQR_TOOM8_THRESHOLD 527
-
-#define MULMID_TOOM42_THRESHOLD 54
-
-#define MULMOD_BNM1_THRESHOLD 13
-#define SQRMOD_BNM1_THRESHOLD 18
-
-#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
- { 143, 9}, { 287, 8}, { 575,10}, { 159,11}, \
- { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
- { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \
- { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \
- { 223,10}, { 447,12}, { 127,11}, { 255,10}, \
- { 543,11}, { 287,10}, { 607, 9}, { 1215,11}, \
- { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 735,12}, { 383,11}, { 831,12}, \
- { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1151,12}, { 703,11}, \
- { 1471,13}, { 383,12}, { 831,11}, { 1663,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
- { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1663,13}, { 895,12}, { 1919,14}, { 511,13}, \
- { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
- { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
- { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \
- { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 150
-#define MUL_FFT_THRESHOLD 4544
-
-#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255,10}, { 79, 9}, { 159, 8}, { 319,10}, \
- { 95, 9}, { 191,11}, { 63,10}, { 127, 9}, \
- { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \
- { 287, 8}, { 575, 9}, { 303, 8}, { 607,10}, \
- { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \
- { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \
- { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \
- { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \
- { 415,11}, { 223,10}, { 447,12}, { 127,11}, \
- { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
- { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
- { 831,12}, { 447,11}, { 959,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 831,11}, { 1663,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \
- { 1471,13}, { 767,12}, { 1663,13}, { 895,12}, \
- { 1791,14}, { 511,13}, { 1023,12}, { 2111,13}, \
- { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \
- { 1663,12}, { 3455,13}, { 1791,15}, { 511,14}, \
- { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
- { 5887,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 151
-#define SQR_FFT_THRESHOLD 2880
-
-#define MULLO_BASECASE_THRESHOLD 6
-#define MULLO_DC_THRESHOLD 48
-#define MULLO_MUL_N_THRESHOLD 8907
-
-#define DC_DIV_QR_THRESHOLD 59
-#define DC_DIVAPPR_Q_THRESHOLD 250
-#define DC_BDIV_QR_THRESHOLD 59
-#define DC_BDIV_Q_THRESHOLD 169
-
-#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 246
-#define INV_APPR_THRESHOLD 246
-
-#define BINV_NEWTON_THRESHOLD 276
-#define REDC_1_TO_REDC_N_THRESHOLD 67
-
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1442
-#define MUPI_DIV_QR_THRESHOLD 114
-#define MU_BDIV_QR_THRESHOLD 1142
-#define MU_BDIV_Q_THRESHOLD 1334
-
-#define POWM_SEC_TABLE 1,22,98,416,1378
-
-#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 133
-#define HGCD_APPR_THRESHOLD 169
-#define HGCD_REDUCE_THRESHOLD 2479
-#define GCD_DC_THRESHOLD 460
-#define GCDEXT_DC_THRESHOLD 342
-#define JACOBI_BASE_METHOD 3
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 321
-#define SET_STR_PRECOMPUTE_THRESHOLD 1099
-
-#define FAC_DSC_THRESHOLD 198
-#define FAC_ODD_THRESHOLD 34
diff --git a/gmp/mpn/x86/atom/logops_n.asm b/gmp/mpn/x86/atom/logops_n.asm
deleted file mode 100644
index 3cb6d7310c..0000000000
--- a/gmp/mpn/x86/atom/logops_n.asm
+++ /dev/null
@@ -1,151 +0,0 @@
-dnl Intel Atom mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C op nop opn
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 3 3.5 3.5
-C AMD K6
-C AMD K7
-C AMD K8
-C AMD K10
-
-define(M4_choose_op,
-`ifdef(`OPERATION_$1',`
-define(`M4_function', `mpn_$1')
-define(`M4_want_pre', `$4')
-define(`M4_inst', `$3')
-define(`M4_want_post',`$2')
-')')
-define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
-define(M4post,`ifelse(M4_want_post,yes,`$1')')
-
-M4_choose_op( and_n, , andl, )
-M4_choose_op( andn_n, , andl, yes)
-M4_choose_op( nand_n, yes, andl, )
-M4_choose_op( ior_n, , orl, )
-M4_choose_op( iorn_n, , orl, yes)
-M4_choose_op( nior_n, yes, orl, )
-M4_choose_op( xor_n, , xorl, )
-M4_choose_op( xnor_n, yes, xorl, )
-
-ifdef(`M4_function',,
-`m4_error(`Unrecognised or undefined OPERATION symbol
-')')
-
-MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-
-C void M4_function (mp_ptr dst, mp_srcptr src2, mp_srcptr src1, mp_size_t size);
-C
-
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC1, 12)
-defframe(PARAM_SRC2, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(SAVE_RP,`PARAM_SIZE')
-define(SAVE_VP,`PARAM_SRC1')
-define(SAVE_UP,`PARAM_DST')
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebx')
-define(`cnt', `%eax')
-define(`r1', `%ecx')
-define(`r2', `%edx')
-
-ASM_START()
- TEXT
- ALIGN(16)
-deflit(`FRAME',0)
-
-PROLOGUE(M4_function)
- mov PARAM_SIZE, cnt C size
- mov rp, SAVE_RP C save rp in the re-used size slot
- mov PARAM_DST, rp
- mov up, SAVE_UP C save up in the re-used dst slot
- mov PARAM_SRC1, up
- shr cnt C size >> 1
- mov vp, SAVE_VP C save vp in the re-used src1 slot
- mov PARAM_SRC2, vp
- mov (up), r1 C first src1 limb, mov keeps the shr flags
- jz L(end) C size == 1
- jnc L(even) C size % 2 == 0
-
- ALIGN(16)
-L(oop): C two limbs per iteration
-M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
- M4_inst (vp), r1
- lea 8(up), up
- mov -4(up), r2
-M4post(` notl_or_xorl_GMP_NUMB_MASK(r1)')
- lea 8(vp), vp
- mov r1, (rp)
-L(entry):
-M4pre(` notl_or_xorl_GMP_NUMB_MASK(r2)')
- M4_inst -4(vp), r2
- lea 8(rp), rp
-M4post(` notl_or_xorl_GMP_NUMB_MASK(r2)')
- dec cnt
- mov (up), r1 C mov keeps the dec flags for the jnz
- mov r2, -4(rp)
- jnz L(oop)
-
-L(end): C last limb is in r1
-M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
- mov SAVE_UP, up
- M4_inst (vp), r1
-M4post(`notl_or_xorl_GMP_NUMB_MASK(r1)')
- mov SAVE_VP, vp
- mov r1, (rp)
- mov SAVE_RP, rp
- ret
-
-L(even):
- mov r1, r2 C first limb goes through the second loop half
- lea 4(up), up
- lea 4(vp), vp
- lea -4(rp), rp
- jmp L(entry)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/lshift.asm b/gmp/mpn/x86/atom/lshift.asm
deleted file mode 100644
index f2c70dd3e8..0000000000
--- a/gmp/mpn/x86/atom/lshift.asm
+++ /dev/null
@@ -1,218 +0,0 @@
-dnl Intel Atom mpn_lshift -- mpn left shift.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned cnt);
-
-C cycles/limb
-C cnt!=1 cnt==1
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 5 2.5
-C AMD K6
-C AMD K7
-C AMD K8
-C AMD K10
-
-defframe(PARAM_CNT, 16)
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(SAVE_UP,`PARAM_CNT')
-define(VAR_COUNT,`PARAM_SIZE')
-define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_EBP,`PARAM_DST')
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`cnt', `%ecx')
-
-ASM_START()
- TEXT
- ALIGN(8)
-deflit(`FRAME',0)
-PROLOGUE(mpn_lshift)
- mov PARAM_CNT, cnt
- mov PARAM_SIZE, %edx
- mov up, SAVE_UP C save up in the re-used cnt slot
- mov PARAM_SRC, up
- push rp FRAME_pushl()
- mov PARAM_DST, rp
-
-C We can use faster code for shift-by-1 under certain conditions.
- cmp $1,cnt
- jne L(normal)
- cmpl rp, up
- jnc L(special) C jump if s_ptr >= res_ptr
- leal (up,%edx,4),%eax
- cmpl %eax,rp
- jnc L(special) C jump if res_ptr >= s_ptr + size
-
-L(normal):
- lea -4(up,%edx,4), up C up = address of the top source limb
- mov %ebx, SAVE_EBX
- lea -4(rp,%edx,4), rp C rp = address of the top result limb
-
- shr %edx C size >> 1
- mov (up), %eax
- mov %edx, VAR_COUNT
- jnc L(evn) C size even
-
- mov %eax, %ebx
- shl %cl, %ebx
- neg cnt C cl now shifts right by 32-cnt
- shr %cl, %eax C eax = bits shifted out of the top limb
- test %edx, %edx
- jnz L(gt1)
- mov %ebx, (rp) C size == 1
- jmp L(quit)
-
-L(gt1): mov %ebp, SAVE_EBP
- push %eax C keep the return value on the stack
- mov -4(up), %eax
- mov %eax, %ebp
- shr %cl, %eax
- jmp L(lo1)
-
-L(evn): mov %ebp, SAVE_EBP
- neg cnt
- mov %eax, %ebp
- mov -4(up), %edx
- shr %cl, %eax C eax = bits shifted out of the top limb
- mov %edx, %ebx
- shr %cl, %edx
- neg cnt
- decl VAR_COUNT
- lea 4(rp), rp
- lea -4(up), up
- jz L(end) C size == 2
- push %eax FRAME_pushl()
-
- ALIGN(8)
-L(top): shl %cl, %ebp
- or %ebp, %edx
- shl %cl, %ebx
- neg cnt
- mov -4(up), %eax
- mov %eax, %ebp
- mov %edx, -4(rp)
- shr %cl, %eax
- lea -8(rp), rp
-L(lo1): mov -8(up), %edx
- or %ebx, %eax
- mov %edx, %ebx
- shr %cl, %edx
- lea -8(up), up
- neg cnt
- mov %eax, (rp)
- decl VAR_COUNT
- jg L(top)
-
- pop %eax FRAME_popl()
-L(end):
- shl %cl, %ebp
- shl %cl, %ebx
- or %ebp, %edx
- mov SAVE_EBP, %ebp
- mov %edx, -4(rp)
- mov %ebx, -8(rp)
-
-L(quit):
- mov SAVE_UP, up
- mov SAVE_EBX, %ebx
- pop rp FRAME_popl()
- ret
-
-L(special):
-deflit(`FRAME',4)
- lea 3(%edx), %eax C size + 3
- dec %edx C size - 1
- mov (up), %ecx
- shr $2, %eax C (size + 3) / 4
- and $3, %edx C (size - 1) % 4
- jz L(goloop) C jump if size == 1 (mod 4)
- shr %edx
- jnc L(odd) C jump if size == 3 (mod 4)
-
- add %ecx, %ecx
- lea 4(up), up
- mov %ecx, (rp)
- mov (up), %ecx
- lea 4(rp), rp
-
- dec %edx
- jnz L(goloop) C jump if size == 2 (mod 4)
-L(odd): lea -8(up), up
- lea -8(rp), rp
- jmp L(sentry) C reached if size == 0 or 3 (mod 4)
-
-L(sloop):
- adc %ecx, %ecx
- mov 4(up), %edx
- mov %ecx, (rp)
- adc %edx, %edx
- mov 8(up), %ecx
- mov %edx, 4(rp)
-L(sentry):
- adc %ecx, %ecx
- mov 12(up), %edx
- mov %ecx, 8(rp)
- adc %edx, %edx
- lea 16(up), up
- mov %edx, 12(rp)
- lea 16(rp), rp
- mov (up), %ecx
-L(goloop):
- decl %eax C dec keeps CF intact
- jnz L(sloop)
-
-L(squit):
- adc %ecx, %ecx C last limb
- mov %ecx, (rp)
- adc %eax, %eax C eax is zero here, return value = CF
-
- mov SAVE_UP, up
- pop rp FRAME_popl()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/lshiftc.asm b/gmp/mpn/x86/atom/lshiftc.asm
deleted file mode 100644
index 5be53ed19d..0000000000
--- a/gmp/mpn/x86/atom/lshiftc.asm
+++ /dev/null
@@ -1,159 +0,0 @@
-dnl Intel Atom mpn_lshiftc -- mpn left shift with complement.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C mp_limb_t mpn_lshiftc (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned cnt);
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 5.5
-C AMD K6
-C AMD K7
-C AMD K8
-C AMD K10
-
-defframe(PARAM_CNT, 16)
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space: each slot below is overwritten only after its parameter has been loaded
-define(SAVE_UP,`PARAM_CNT')
-define(VAR_COUNT,`PARAM_SIZE')
-define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_EBP,`PARAM_DST')
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`cnt', `%ecx')
-
-ASM_START()
- TEXT
-C Walks from the last limb down to the first, two limbs per loop pass, storing the complement of every shifted limb; %eax returns the bits shifted out of the last limb.
-PROLOGUE(mpn_lshiftc)
-deflit(`FRAME',0)
- mov PARAM_CNT, cnt C cl = shift count
- mov PARAM_SIZE, %edx C edx = size in limbs
- mov up, SAVE_UP C save esi in the cnt slot, which was read above
- mov PARAM_SRC, up
- push rp FRAME_pushl() C save edi on the stack
- mov PARAM_DST, rp
-
- lea -4(up,%edx,4), up C up -> last (highest-addressed) source limb
- mov %ebx, SAVE_EBX C save ebx in the src slot, which was read above
- lea -4(rp,%edx,4), rp C rp -> last destination limb
-
- shr %edx C edx = size / 2, carry = size & 1
- mov (up), %eax C eax = last source limb
- mov %edx, VAR_COUNT C counter of limb pairs
- jnc L(evn) C carry still from the shr: jump if size is even
-
- mov %eax, %ebx
- shl %cl, %ebx C ebx = last limb << cnt
- neg cnt C cl = -cnt; the CPU masks shift counts to 5 bits, so this is 32 - cnt for cnt in 1..31
- shr %cl, %eax C eax = bits shifted out of the last limb (return value)
- test %edx, %edx
- jnz L(gt1) C more than one limb
- not %ebx C complement the only result limb
- mov %ebx, (rp)
- jmp L(quit) C size == 1 is done
-
-L(gt1): mov %ebp, SAVE_EBP C save ebp in the dst slot, which was read above
- push %eax C park the return value; popped after the loop
- mov -4(up), %eax C next lower limb
- mov %eax, %ebp C unshifted copy, shifted left on a later step
- shr %cl, %eax C its high bits, merged into the limb above at lo1
- jmp L(lo1) C enter the two-limb loop at its midpoint
-
-L(evn): mov %ebp, SAVE_EBP C save ebp in the dst slot, which was read above
- neg cnt C cl = -cnt for the right shifts below
- mov %eax, %ebp C ebp = last limb, unshifted
- mov -4(up), %edx C edx = next lower limb
- shr %cl, %eax C eax = bits shifted out of the last limb (return value)
- mov %edx, %ebx C ebx = unshifted copy of that lower limb
- shr %cl, %edx C its high bits, merged into the limb above
- neg cnt C cl = cnt again
- decl VAR_COUNT
- lea 4(rp), rp C lea leaves the flags from decl intact
- lea -4(up), up
- jz L(end) C size == 2: no loop passes needed
- push %eax FRAME_pushl() C park the return value during the loop
-
-L(top): shl %cl, %ebp C ebp = upper limb << cnt
- or %ebp, %edx C edx = finished result limb
- shl %cl, %ebx
- neg cnt C cl = -cnt for the shr below
- not %edx C complement before storing
- mov -4(up), %eax
- mov %eax, %ebp
- mov %edx, -4(rp)
- shr %cl, %eax
- lea -8(rp), rp
-L(lo1): mov -8(up), %edx
- or %ebx, %eax C eax = finished result limb
- mov %edx, %ebx
- shr %cl, %edx
- not %eax C complement before storing
- lea -8(up), up
- neg cnt C cl = cnt again
- mov %eax, (rp)
- decl VAR_COUNT C one limb pair done
- jg L(top)
-
- pop %eax FRAME_popl() C recover the return value
-L(end):
- shl %cl, %ebp
- shl %cl, %ebx C ebx = first limb << cnt (nothing below to merge in)
- or %ebp, %edx
- mov SAVE_EBP, %ebp C restore ebp
- not %edx
- not %ebx
- mov %edx, -4(rp) C store the two lowest result limbs
- mov %ebx, -8(rp)
-
-L(quit):
- mov SAVE_UP, up C restore esi
- mov SAVE_EBX, %ebx C restore ebx
- pop rp FRAME_popl() C restore edi
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/mmx/copyd.asm b/gmp/mpn/x86/atom/mmx/copyd.asm
deleted file mode 100644
index b80fb033fe..0000000000
--- a/gmp/mpn/x86/atom/mmx/copyd.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_copyd -- copy limb vector, decrementing.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_copyd)
-include_mpn(`x86/k7/mmx/copyd.asm')
diff --git a/gmp/mpn/x86/atom/mmx/copyi.asm b/gmp/mpn/x86/atom/mmx/copyi.asm
deleted file mode 100644
index 49b6b8d662..0000000000
--- a/gmp/mpn/x86/atom/mmx/copyi.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_copyi -- copy limb vector, incrementing.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_copyi)
-include_mpn(`x86/k7/mmx/copyi.asm')
diff --git a/gmp/mpn/x86/atom/mmx/hamdist.asm b/gmp/mpn/x86/atom/mmx/hamdist.asm
deleted file mode 100644
index 3fe8253240..0000000000
--- a/gmp/mpn/x86/atom/mmx/hamdist.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_hamdist -- hamming distance.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_hamdist)
-include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/gmp/mpn/x86/atom/mod_34lsub1.asm b/gmp/mpn/x86/atom/mod_34lsub1.asm
deleted file mode 100644
index 6d57ba385d..0000000000
--- a/gmp/mpn/x86/atom/mod_34lsub1.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_mod_34lsub1)
-include_mpn(`x86/p6/mod_34lsub1.asm')
diff --git a/gmp/mpn/x86/atom/mode1o.asm b/gmp/mpn/x86/atom/mode1o.asm
deleted file mode 100644
index c9ee6bd2db..0000000000
--- a/gmp/mpn/x86/atom/mode1o.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_modexact_1_odd -- exact division style remainder.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_modexact_1_odd mpn_modexact_1c_odd)
-include_mpn(`x86/pentium/mode1o.asm')
diff --git a/gmp/mpn/x86/atom/rshift.asm b/gmp/mpn/x86/atom/rshift.asm
deleted file mode 100644
index 1cb5dbefe9..0000000000
--- a/gmp/mpn/x86/atom/rshift.asm
+++ /dev/null
@@ -1,152 +0,0 @@
-dnl Intel Atom mpn_rshift -- mpn right shift.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Converted from AMD64 by Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned cnt);
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 5
-C AMD K6
-C AMD K7
-C AMD K8
-C AMD K10
-
-defframe(PARAM_CNT, 16)
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space: each slot below is overwritten only after its parameter has been loaded
-define(SAVE_UP,`PARAM_CNT')
-define(VAR_COUNT,`PARAM_SIZE')
-define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_EBP,`PARAM_DST')
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`cnt', `%ecx')
-C Walks from the first limb up to the last, two limbs per loop pass; %eax returns the bits shifted out of the first limb.
-ASM_START()
- TEXT
- ALIGN(8)
-deflit(`FRAME',0)
-PROLOGUE(mpn_rshift)
- mov PARAM_CNT, cnt C cl = shift count
- mov PARAM_SIZE, %edx C edx = size in limbs
- mov up, SAVE_UP C save esi in the cnt slot, which was read above
- mov PARAM_SRC, up
- push rp FRAME_pushl() C save edi on the stack
- mov PARAM_DST, rp
- mov %ebx, SAVE_EBX C save ebx in the src slot, which was read above
-
- shr %edx C edx = size / 2, carry = size & 1
- mov (up), %eax C eax = first source limb
- mov %edx, VAR_COUNT C counter of limb pairs
- jnc L(evn) C carry still from the shr: jump if size is even
-
- mov %eax, %ebx
- shr %cl, %ebx C ebx = first limb >> cnt
- neg cnt C cl = -cnt; the CPU masks shift counts to 5 bits, so this is 32 - cnt for cnt in 1..31
- shl %cl, %eax C eax = bits shifted out of the first limb (return value)
- test %edx, %edx
- jnz L(gt1) C more than one limb
- mov %ebx, (rp)
- jmp L(quit) C size == 1 is done
-
-L(gt1): mov %ebp, SAVE_EBP C save ebp in the dst slot, which was read above
- push %eax C park the return value; popped after the loop
- mov 4(up), %eax C next higher limb
- mov %eax, %ebp C unshifted copy, shifted right on a later step
- shl %cl, %eax C its low bits, merged into the limb below at lo1
- jmp L(lo1) C enter the two-limb loop at its midpoint
-
-L(evn): mov %ebp, SAVE_EBP C save ebp in the dst slot, which was read above
- neg cnt C cl = -cnt for the left shifts below
- mov %eax, %ebp C ebp = first limb, unshifted
- mov 4(up), %edx C edx = next higher limb
- shl %cl, %eax C eax = bits shifted out of the first limb (return value)
- mov %edx, %ebx C ebx = unshifted copy of that higher limb
- shl %cl, %edx C its low bits, merged into the limb below
- neg cnt C cl = cnt again
- decl VAR_COUNT
- lea -4(rp), rp C lea leaves the flags from decl intact
- lea 4(up), up
- jz L(end) C size == 2: no loop passes needed
- push %eax FRAME_pushl() C park the return value during the loop
-
- ALIGN(8)
-L(top): shr %cl, %ebp C ebp = lower limb >> cnt
- or %ebp, %edx C edx = finished result limb
- shr %cl, %ebx
- neg cnt C cl = -cnt for the shl below
- mov 4(up), %eax
- mov %eax, %ebp
- mov %edx, 4(rp)
- shl %cl, %eax
- lea 8(rp), rp
-L(lo1): mov 8(up), %edx
- or %ebx, %eax C eax = finished result limb
- mov %edx, %ebx
- shl %cl, %edx
- lea 8(up), up
- neg cnt C cl = cnt again
- mov %eax, (rp)
- decl VAR_COUNT C one limb pair done
- jg L(top)
-
- pop %eax FRAME_popl() C recover the return value
-L(end):
- shr %cl, %ebp
- shr %cl, %ebx C ebx = last limb >> cnt (nothing above to merge in)
- or %ebp, %edx
- mov SAVE_EBP, %ebp C restore ebp
- mov %edx, 4(rp) C store the two highest result limbs
- mov %ebx, 8(rp)
-
-L(quit):
- mov SAVE_UP, up C restore esi
- mov SAVE_EBX, %ebx C restore ebx
- pop rp FRAME_popl() C restore edi
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/sse2/aorsmul_1.asm b/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
deleted file mode 100644
index 969a14a919..0000000000
--- a/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
+++ /dev/null
@@ -1,174 +0,0 @@
-dnl x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C cycles/limb
-C P5 -
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 8
-C AMD K6
-C AMD K7 -
-C AMD K8
-C AMD K10
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`n', `%ecx')
-
-ifdef(`OPERATION_addmul_1',`
- define(ADDSUB, add)
- define(func_1, mpn_addmul_1)
- define(func_1c, mpn_addmul_1c)')
-ifdef(`OPERATION_submul_1',`
- define(ADDSUB, sub)
- define(func_1, mpn_submul_1)
- define(func_1c, mpn_submul_1c)')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-C Multiplies the n limbs at up by one limb and ADDSUBs the products into rp, four limbs per loop pass; %eax returns the final carry limb. func_1c starts from a caller-supplied carry limb.
- TEXT
- ALIGN(16)
-PROLOGUE(func_1)
- xor %edx, %edx C plain entry point: carry-in limb is zero
-L(ent): push %edi
- push %esi
- push %ebx
- mov 16(%esp), rp C arguments sit 12 bytes higher after the three pushes
- mov 20(%esp), up
- mov 24(%esp), n
- movd 28(%esp), %mm7 C mm7 = multiplier limb
- test $1, n
- jz L(fi0or2) C n is even
- movd (up), %mm0
- pmuludq %mm7, %mm0 C mm0 = up[0] * multiplier, 64-bit product
- shr $2, n C n = n / 4, carry = bit 1 of the old n
- jnc L(fi1) C n == 1 (mod 4)
-
-L(fi3): lea -8(up), up C n == 3 (mod 4): bias the pointers for entry at lo3
- lea -8(rp), rp
- movd 12(up), %mm1
- movd %mm0, %ebx
- pmuludq %mm7, %mm1
- add $1, n C increment and clear carry
- jmp L(lo3)
-
-L(fi1): movd %mm0, %ebx
- jz L(wd1) C zero flag still from the shr: n == 1, go straight to wind-down
- movd 4(up), %mm1
- pmuludq %mm7, %mm1
- jmp L(lo1) C carry is clear here (jnc was taken)
-
-L(fi0or2):
- movd (up), %mm1
- pmuludq %mm7, %mm1 C mm1 = up[0] * multiplier, 64-bit product
- shr $2, n C n = n / 4, carry = bit 1 of the old n
- movd 4(up), %mm0
- jc L(fi2) C n == 2 (mod 4)
- lea -4(up), up C n == 0 (mod 4): bias the pointers for entry at lo0
- lea -4(rp), rp
- movd %mm1, %eax
- pmuludq %mm7, %mm0
- jmp L(lo0) C carry is clear here (jc was not taken)
-
-L(fi2): lea 4(up), up C bias the pointers for entry at lo2
- add $1, n C increment and clear carry
- movd %mm1, %eax
- lea -12(rp), rp
- jmp L(lo2)
-
-C ALIGN(16) C alignment seems irrelevant
-L(top): movd 4(up), %mm1
- adc $0, %edx C absorb the carry left by the adc before dec
- ADDSUB %eax, 12(rp)
- movd %mm0, %ebx C ebx = low half of the next product
- pmuludq %mm7, %mm1
- lea 16(rp), rp
-L(lo1): psrlq $32, %mm0 C expose the high half of the product
- adc %edx, %ebx C low half + previous high half + carry from ADDSUB
- movd %mm0, %edx C edx = high half
- movd %mm1, %eax
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- ADDSUB %ebx, (rp)
-L(lo0): psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- movd %mm0, %ebx
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- ADDSUB %eax, 4(rp)
-L(lo3): psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- ADDSUB %ebx, 8(rp)
-L(lo2): psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- dec n C dec leaves the carry flag untouched for the next adc
- jnz L(top)
-
-L(end): adc n, %edx C n is zero here
- ADDSUB %eax, 12(rp)
- movd %mm0, %ebx
- lea 16(rp), rp
-L(wd1): psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %eax C eax = high half of the last product
- adc n, %eax C n is zero on both paths into wd1, so this adds only the carry flag
- ADDSUB %ebx, (rp)
- emms C clear the MMX state before returning
- adc n, %eax C fold in the carry or borrow from the last ADDSUB
- pop %ebx
- pop %esi
- pop %edi
- ret
-EPILOGUE()
-PROLOGUE(func_1c)
- mov 20(%esp), %edx C carry
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm b/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
deleted file mode 100644
index 782e914019..0000000000
--- a/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_bdiv_dbm1.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_bdiv_dbm1c)
-include_mpn(`x86/pentium4/sse2/bdiv_dbm1c.asm')
diff --git a/gmp/mpn/x86/atom/sse2/divrem_1.asm b/gmp/mpn/x86/atom/sse2/divrem_1.asm
deleted file mode 100644
index f84709a22e..0000000000
--- a/gmp/mpn/x86/atom/sse2/divrem_1.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_divrem_1 -- mpn by limb division.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_preinv_divrem_1 mpn_divrem_1c mpn_divrem_1)
-include_mpn(`x86/pentium4/sse2/divrem_1.asm')
diff --git a/gmp/mpn/x86/atom/sse2/mod_1_1.asm b/gmp/mpn/x86/atom/sse2/mod_1_1.asm
deleted file mode 100644
index ae6581d9b6..0000000000
--- a/gmp/mpn/x86/atom/sse2/mod_1_1.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom/SSE2 mpn_mod_1_1.
-
-dnl Copyright 2009, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_mod_1_1p)
-include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/gmp/mpn/x86/atom/sse2/mod_1_4.asm b/gmp/mpn/x86/atom/sse2/mod_1_4.asm
deleted file mode 100644
index 31faa3f0a3..0000000000
--- a/gmp/mpn/x86/atom/sse2/mod_1_4.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom/SSE2 mpn_mod_1_4.
-
-dnl Copyright 2009, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_mod_1s_4p)
-include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/gmp/mpn/x86/atom/sse2/mul_1.asm b/gmp/mpn/x86/atom/sse2/mul_1.asm
deleted file mode 100644
index aa3bb974bb..0000000000
--- a/gmp/mpn/x86/atom/sse2/mul_1.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-dnl Intel Atom mpn_mul_1.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C cycles/limb
-C P5 -
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 7.5
-C AMD K6 -
-C AMD K7 -
-C AMD K8
-C AMD K10
-
-defframe(PARAM_CARRY,20)
-defframe(PARAM_MUL, 16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-define(`rp', `%edx')
-define(`up', `%esi')
-define(`n', `%ecx')
-
-ASM_START()
- TEXT
- ALIGN(16)
-deflit(`FRAME',0)
-C mm6 is a 64-bit running sum: each limb product is added to it, its low 32 bits are stored to rp and its high 32 bits carry into the next limb; %eax returns the final high half.
-PROLOGUE(mpn_mul_1c)
- movd PARAM_CARRY, %mm6 C carry
- jmp L(ent)
-EPILOGUE()
-
- ALIGN(8) C for compact code
-PROLOGUE(mpn_mul_1)
- pxor %mm6, %mm6 C plain entry point: carry-in is zero
-L(ent): push %esi FRAME_pushl()
- mov PARAM_SRC, up
- mov PARAM_SIZE, %eax C size
- movd PARAM_MUL, %mm7 C mm7 = multiplier limb
- movd (up), %mm0
- mov %eax, n
- and $3, %eax C eax = size mod 4, selects the loop entry point
- pmuludq %mm7, %mm0 C mm0 = up[0] * multiplier, 64-bit product
- mov PARAM_DST, rp
- jz L(lo0) C zero flag still from the and: size == 0 (mod 4)
- cmp $2, %eax
- lea -16(up,%eax,4),up C bias the pointers for a mid-loop entry; lea keeps the cmp flags
- lea -16(rp,%eax,4),rp
- jc L(lo1) C size == 1 (mod 4)
- jz L(lo2) C size == 2 (mod 4)
- jmp L(lo3) C size == 3 (mod 4)
-
- ALIGN(16)
-L(top): movd (up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6 C keep only the carry half of the running sum
- lea 16(rp), rp
-L(lo0): paddq %mm0, %mm6 C running sum += product
- movd 4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, (rp) C store the low 32 bits as the result limb
- psrlq $32, %mm6
-L(lo3): paddq %mm0, %mm6
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 4(rp)
- psrlq $32, %mm6
-L(lo2): paddq %mm0, %mm6
- movd 12(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 8(rp)
- psrlq $32, %mm6
-L(lo1): paddq %mm0, %mm6
- sub $4, n C the count drops by four per pass
- movd %mm6, 12(rp)
- lea 16(up), up
- ja L(top) C flags still from the sub: loop while the count stayed above zero
-
- psrlq $32, %mm6
- movd %mm6, %eax C return the final carry limb
- emms C clear the MMX state before returning
- pop %esi FRAME_popl() C restore esi
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/atom/sse2/mul_basecase.asm b/gmp/mpn/x86/atom/sse2/mul_basecase.asm
deleted file mode 100644
index 97d3aeb5ad..0000000000
--- a/gmp/mpn/x86/atom/sse2/mul_basecase.asm
+++ /dev/null
@@ -1,501 +0,0 @@
-dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
-dnl a third limb vector.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO
-C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
-C 4 large loops into one; we could use it for the outer loop branch.
-C * Optimise code outside of inner loops.
-C * Write combined addmul_1 feed-in a wind-down code, and use when iterating
-C outer each loop. ("Overlapping software pipelining")
-C * Postpone push of ebx until we know vn > 1. Perhaps use caller-saves regs
-C for inlined mul_1, allowing us to postpone all pushes.
-C * Perhaps write special code for vn <= un < M, for some small M.
-
-C void mpn_mul_basecase (mp_ptr wp,
-C mp_srcptr xp, mp_size_t xn,
-C mp_srcptr yp, mp_size_t yn);
-C
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`un', `%ecx')
-define(`vp', `%ebp')
-define(`vn', `36(%esp)')
-
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_basecase)
- push %edi
- push %esi
- push %ebx
- push %ebp
- mov 20(%esp), rp
- mov 24(%esp), up
- mov 28(%esp), un
- mov 32(%esp), vp
-
- movd (up), %mm0
- movd (vp), %mm7
- pmuludq %mm7, %mm0
- pxor %mm6, %mm6
-
- mov un, %eax
- and $3, %eax
- jz L(of0)
- cmp $2, %eax
- jc L(of1)
- jz L(of2)
-
-C ================================================================
- jmp L(m3)
- ALIGN(16)
-L(lm3): movd -4(up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
- paddq %mm0, %mm6
- movd (up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -4(rp)
- psrlq $32, %mm6
-L(m3): paddq %mm0, %mm6
- movd 4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, (rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 4(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- sub $4, un
- movd %mm6, 8(rp)
- lea 16(up), up
- ja L(lm3)
-
- psrlq $32, %mm6
- movd %mm6, 12(rp)
-
- decl vn
- jz L(done)
- lea -8(rp), rp
-
-L(ol3): mov 28(%esp), un
- neg un
- lea 4(vp), vp
- movd (vp), %mm7 C read next V limb
- mov 24(%esp), up
- lea 16(rp,un,4), rp
-
- movd (up), %mm0
- pmuludq %mm7, %mm0
- sar $2, un
- movd 4(up), %mm1
- movd %mm0, %ebx
- pmuludq %mm7, %mm1
- lea -8(up), up
- xor %edx, %edx C zero edx and CF
- jmp L(a3)
-
-L(la3): movd 4(up), %mm1
- adc $0, %edx
- add %eax, 12(rp)
- movd %mm0, %ebx
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %ebx, (rp)
- psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- movd %mm0, %ebx
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %eax, 4(rp)
-L(a3): psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %ebx, 8(rp)
- psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- jnz L(la3)
-
- adc un, %edx C un is zero here
- add %eax, 12(rp)
- movd %mm0, %ebx
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %eax
- adc un, %eax
- add %ebx, 16(rp)
- adc un, %eax
- mov %eax, 20(rp)
-
- decl vn
- jnz L(ol3)
- jmp L(done)
-
-C ================================================================
- ALIGN(16)
-L(lm0): movd (up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
-L(of0): paddq %mm0, %mm6
- movd 4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, (rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 4(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd 12(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 8(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- sub $4, un
- movd %mm6, 12(rp)
- lea 16(up), up
- ja L(lm0)
-
- psrlq $32, %mm6
- movd %mm6, 16(rp)
-
- decl vn
- jz L(done)
- lea -4(rp), rp
-
-L(ol0): mov 28(%esp), un
- neg un
- lea 4(vp), vp
- movd (vp), %mm7 C read next V limb
- mov 24(%esp), up
- lea 20(rp,un,4), rp
-
- movd (up), %mm1
- pmuludq %mm7, %mm1
- sar $2, un
- movd 4(up), %mm0
- lea -4(up), up
- movd %mm1, %eax
- pmuludq %mm7, %mm0
- xor %edx, %edx C zero edx and CF
- jmp L(a0)
-
-L(la0): movd 4(up), %mm1
- adc $0, %edx
- add %eax, 12(rp)
- movd %mm0, %ebx
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %ebx, (rp)
-L(a0): psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- movd %mm0, %ebx
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %eax, 4(rp)
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %ebx, 8(rp)
- psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- jnz L(la0)
-
- adc un, %edx C un is zero here
- add %eax, 12(rp)
- movd %mm0, %ebx
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %eax
- adc un, %eax
- add %ebx, 16(rp)
- adc un, %eax
- mov %eax, 20(rp)
-
- decl vn
- jnz L(ol0)
- jmp L(done)
-
-C ================================================================
- ALIGN(16)
-L(lm1): movd -12(up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
- paddq %mm0, %mm6
- movd -8(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -12(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd -4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -8(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd (up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -4(rp)
- psrlq $32, %mm6
-L(of1): paddq %mm0, %mm6
- sub $4, un
- movd %mm6, (rp)
- lea 16(up), up
- ja L(lm1)
-
- psrlq $32, %mm6
- movd %mm6, 4(rp)
-
- decl vn
- jz L(done)
- lea -16(rp), rp
-
-L(ol1): mov 28(%esp), un
- neg un
- lea 4(vp), vp
- movd (vp), %mm7 C read next V limb
- mov 24(%esp), up
- lea 24(rp,un,4), rp
-
- movd (up), %mm0
- pmuludq %mm7, %mm0
- sar $2, un
- movd %mm0, %ebx
- movd 4(up), %mm1
- pmuludq %mm7, %mm1
- xor %edx, %edx C zero edx and CF
- inc un
- jmp L(a1)
-
-L(la1): movd 4(up), %mm1
- adc $0, %edx
- add %eax, 12(rp)
- movd %mm0, %ebx
- pmuludq %mm7, %mm1
- lea 16(rp), rp
-L(a1): psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %ebx, (rp)
- psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- movd %mm0, %ebx
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %eax, 4(rp)
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %ebx, 8(rp)
- psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- jnz L(la1)
-
- adc un, %edx C un is zero here
- add %eax, 12(rp)
- movd %mm0, %ebx
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %eax
- adc un, %eax
- add %ebx, 16(rp)
- adc un, %eax
- mov %eax, 20(rp)
-
- decl vn
- jnz L(ol1)
- jmp L(done)
-
-C ================================================================
- ALIGN(16)
-L(lm2): movd -8(up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
- paddq %mm0, %mm6
- movd -4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -8(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd (up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -4(rp)
- psrlq $32, %mm6
-L(of2): paddq %mm0, %mm6
- movd 4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, (rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- sub $4, un
- movd %mm6, 4(rp)
- lea 16(up), up
- ja L(lm2)
-
- psrlq $32, %mm6
- movd %mm6, 8(rp)
-
- decl vn
- jz L(done)
- lea -12(rp), rp
-
-L(ol2): mov 28(%esp), un
- neg un
- lea 4(vp), vp
- movd (vp), %mm7 C read next V limb
- mov 24(%esp), up
- lea 12(rp,un,4), rp
-
- movd (up), %mm1
- pmuludq %mm7, %mm1
- sar $2, un
- movd 4(up), %mm0
- lea 4(up), up
- movd %mm1, %eax
- xor %edx, %edx C zero edx and CF
- jmp L(lo2)
-
-L(la2): movd 4(up), %mm1
- adc $0, %edx
- add %eax, 12(rp)
- movd %mm0, %ebx
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %ebx, (rp)
- psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- movd %mm0, %ebx
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %eax, 4(rp)
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %edx
- movd %mm1, %eax
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %ebx, 8(rp)
-L(lo2): psrlq $32, %mm1
- adc %edx, %eax
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- jnz L(la2)
-
- adc un, %edx C un is zero here
- add %eax, 12(rp)
- movd %mm0, %ebx
- psrlq $32, %mm0
- adc %edx, %ebx
- movd %mm0, %eax
- adc un, %eax
- add %ebx, 16(rp)
- adc un, %eax
- mov %eax, 20(rp)
-
- decl vn
- jnz L(ol2)
-C jmp L(done)
-
-C ================================================================
-L(done):
- emms
- pop %ebp
- pop %ebx
- pop %esi
- pop %edi
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/atom/sse2/popcount.asm b/gmp/mpn/x86/atom/sse2/popcount.asm
deleted file mode 100644
index 7847aec8e6..0000000000
--- a/gmp/mpn/x86/atom/sse2/popcount.asm
+++ /dev/null
@@ -1,35 +0,0 @@
-dnl Intel Atom mpn_popcount -- population count.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp/mpn/x86/atom/sse2/sqr_basecase.asm b/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
deleted file mode 100644
index af19ed854d..0000000000
--- a/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
+++ /dev/null
@@ -1,634 +0,0 @@
-dnl x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO
-C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
-C 4 large loops into one; we could use it for the outer loop branch.
-C * Optimise code outside of inner loops.
-C * Write combined addmul_1 feed-in a wind-down code, and use when iterating
-C outer each loop. ("Overlapping software pipelining")
-C * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
-C all pushes.
-C * Perhaps write special code for n < M, for some small M.
-C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
-C with even less pipelined code.
-C * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
-C Consider breaking out earlier, saving high the cost of short loops.
-
-C void mpn_sqr_basecase (mp_ptr wp,
-C mp_srcptr xp, mp_size_t xn);
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`n', `%ecx')
-
-define(`un', `%ebp')
-
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sqr_basecase)
- push %edi
- push %esi
- mov 12(%esp), rp
- mov 16(%esp), up
- mov 20(%esp), n
-
- lea 4(rp), rp C write triangular product starting at rp[1]
- dec n
- movd (up), %mm7
-
- jz L(one)
- lea 4(up), up
- push %ebx
- push %ebp
- mov n, %eax
-
- movd (up), %mm0
- neg n
- pmuludq %mm7, %mm0
- pxor %mm6, %mm6
- mov n, un
-
- and $3, %eax
- jz L(of0)
- cmp $2, %eax
- jc L(of1)
- jz L(of2)
-
-C ================================================================
- jmp L(m3)
- ALIGN(16)
-L(lm3): movd -4(up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
- paddq %mm0, %mm6
- movd (up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -4(rp)
- psrlq $32, %mm6
-L(m3): paddq %mm0, %mm6
- movd 4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, (rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 4(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- add $4, un
- movd %mm6, 8(rp)
- lea 16(up), up
- js L(lm3)
-
- psrlq $32, %mm6
- movd %mm6, 12(rp)
-
- inc n
-C jz L(done)
- lea -12(up), up
- lea 4(rp), rp
- jmp L(ol2)
-
-C ================================================================
- ALIGN(16)
-L(lm0): movd (up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
-L(of0): paddq %mm0, %mm6
- movd 4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, (rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 4(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd 12(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, 8(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- add $4, un
- movd %mm6, 12(rp)
- lea 16(up), up
- js L(lm0)
-
- psrlq $32, %mm6
- movd %mm6, 16(rp)
-
- inc n
-C jz L(done)
- lea -8(up), up
- lea 8(rp), rp
- jmp L(ol3)
-
-C ================================================================
- ALIGN(16)
-L(lm1): movd -12(up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
- paddq %mm0, %mm6
- movd -8(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -12(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd -4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -8(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd (up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -4(rp)
- psrlq $32, %mm6
-L(of1): paddq %mm0, %mm6
- add $4, un
- movd %mm6, (rp)
- lea 16(up), up
- js L(lm1)
-
- psrlq $32, %mm6
- movd %mm6, 4(rp)
-
- inc n
- jz L(done) C goes away when we add special n=2 code
- lea -20(up), up
- lea -4(rp), rp
- jmp L(ol0)
-
-C ================================================================
- ALIGN(16)
-L(lm2): movd -8(up), %mm0
- pmuludq %mm7, %mm0
- psrlq $32, %mm6
- lea 16(rp), rp
- paddq %mm0, %mm6
- movd -4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -8(rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- movd (up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, -4(rp)
- psrlq $32, %mm6
-L(of2): paddq %mm0, %mm6
- movd 4(up), %mm0
- pmuludq %mm7, %mm0
- movd %mm6, (rp)
- psrlq $32, %mm6
- paddq %mm0, %mm6
- add $4, un
- movd %mm6, 4(rp)
- lea 16(up), up
- js L(lm2)
-
- psrlq $32, %mm6
- movd %mm6, 8(rp)
-
- inc n
-C jz L(done)
- lea -16(up), up
-C lea (rp), rp
-C jmp L(ol1)
-
-C ================================================================
-
-L(ol1): lea 4(up,n,4), up
- movd (up), %mm7 C read next U invariant limb
- lea 8(rp,n,4), rp
- mov n, un
-
- movd 4(up), %mm1
- pmuludq %mm7, %mm1
- sar $2, un
- movd %mm1, %ebx
- inc un
- jz L(re1)
-
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- xor %edx, %edx C zero edx and CF
- jmp L(a1)
-
-L(la1): adc $0, %edx
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %eax, (rp)
-L(a1): psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- movd %mm0, %eax
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %ebx, 4(rp)
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %eax, 8(rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- movd 4(up), %mm1
- jnz L(la1)
-
- adc un, %edx C un is zero here
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- adc un, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %eax
- adc un, %eax
- add %ebx, 4(rp)
- adc un, %eax
- mov %eax, 8(rp)
-
- inc n
-
-C ================================================================
-
-L(ol0): lea (up,n,4), up
- movd 4(up), %mm7 C read next U invariant limb
- lea 4(rp,n,4), rp
- mov n, un
-
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- sar $2, un
- movd 12(up), %mm1
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- xor %edx, %edx C zero edx and CF
- jmp L(a0)
-
-L(la0): adc $0, %edx
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- movd %mm0, %eax
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %ebx, 4(rp)
-L(a0): psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %eax, 8(rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- movd 4(up), %mm1
- jnz L(la0)
-
- adc un, %edx C un is zero here
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- adc un, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %eax
- adc un, %eax
- add %ebx, 4(rp)
- adc un, %eax
- mov %eax, 8(rp)
-
- inc n
-
-C ================================================================
-
-L(ol3): lea 12(up,n,4), up
- movd -8(up), %mm7 C read next U invariant limb
- lea (rp,n,4), rp C put rp back
- mov n, un
-
- movd -4(up), %mm1
- pmuludq %mm7, %mm1
- sar $2, un
- movd %mm1, %ebx
- movd (up), %mm0
- xor %edx, %edx C zero edx and CF
- jmp L(a3)
-
-L(la3): adc $0, %edx
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- movd %mm0, %eax
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %ebx, 4(rp)
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %eax, 8(rp)
-L(a3): psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- movd 4(up), %mm1
- jnz L(la3)
-
- adc un, %edx C un is zero here
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- adc un, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %eax
- adc un, %eax
- add %ebx, 4(rp)
- adc un, %eax
- mov %eax, 8(rp)
-
- inc n
-
-C ================================================================
-
-L(ol2): lea 8(up,n,4), up
- movd -4(up), %mm7 C read next U invariant limb
- lea 12(rp,n,4), rp
- mov n, un
-
- movd (up), %mm0
- pmuludq %mm7, %mm0
- xor %edx, %edx
- sar $2, un
- movd 4(up), %mm1
- test un, un C clear carry
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- inc un
- jnz L(a2)
- jmp L(re2)
-
-L(la2): adc $0, %edx
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
-L(a2): psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- movd 8(up), %mm0
- pmuludq %mm7, %mm0
- adc $0, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- movd %mm0, %eax
- movd 12(up), %mm1
- pmuludq %mm7, %mm1
- adc $0, %edx
- add %ebx, 4(rp)
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- lea 16(up), up
- movd (up), %mm0
- adc $0, %edx
- add %eax, 8(rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %edx
- pmuludq %mm7, %mm0
- inc un
- movd 4(up), %mm1
- jnz L(la2)
-
- adc un, %edx C un is zero here
- add %ebx, 12(rp)
- movd %mm0, %eax
- pmuludq %mm7, %mm1
- lea 16(rp), rp
- psrlq $32, %mm0
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- adc un, %edx
- add %eax, (rp)
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %eax
- adc un, %eax
- add %ebx, 4(rp)
- adc un, %eax
- mov %eax, 8(rp)
-
- inc n
- jmp L(ol1)
-
-C ================================================================
-L(re2): psrlq $32, %mm0
- movd (up), %mm7 C read next U invariant limb
- adc %edx, %eax
- movd %mm0, %edx
- movd %mm1, %ebx
- adc un, %edx
- add %eax, (rp)
- lea 4(rp), rp
- psrlq $32, %mm1
- adc %edx, %ebx
- movd %mm1, %eax
- movd 4(up), %mm1
- adc un, %eax
- add %ebx, (rp)
- pmuludq %mm7, %mm1
- adc un, %eax
- mov %eax, 4(rp)
- movd %mm1, %ebx
-
-L(re1): psrlq $32, %mm1
- add %ebx, 4(rp)
- movd %mm1, %eax
- adc un, %eax
- xor n, n C make n zeroness assumption below true
- mov %eax, 8(rp)
-
-L(done): C n is zero here
- mov 24(%esp), up
- mov 28(%esp), %eax
-
- movd (up), %mm0
- inc %eax
- pmuludq %mm0, %mm0
- lea 4(up), up
- mov 20(%esp), rp
- shr %eax
- movd %mm0, (rp)
- psrlq $32, %mm0
- lea -12(rp), rp
- mov %eax, 28(%esp)
- jnc L(odd)
-
- movd %mm0, %ebp
- movd (up), %mm0
- lea 8(rp), rp
- pmuludq %mm0, %mm0
- lea -4(up), up
- add 8(rp), %ebp
- movd %mm0, %edx
- adc 12(rp), %edx
- rcr n
- jmp L(ent)
-
-C ALIGN(16) C alignment seems irrelevant
-L(top): movd (up), %mm1
- adc n, n
- movd %mm0, %eax
- pmuludq %mm1, %mm1
- movd 4(up), %mm0
- adc (rp), %eax
- movd %mm1, %ebx
- pmuludq %mm0, %mm0
- psrlq $32, %mm1
- adc 4(rp), %ebx
- movd %mm1, %ebp
- movd %mm0, %edx
- adc 8(rp), %ebp
- adc 12(rp), %edx
- rcr n C FIXME: isn't this awfully slow on atom???
- adc %eax, (rp)
- adc %ebx, 4(rp)
-L(ent): lea 8(up), up
- adc %ebp, 8(rp)
- psrlq $32, %mm0
- adc %edx, 12(rp)
-L(odd): decl 28(%esp)
- lea 16(rp), rp
- jnz L(top)
-
-L(end): adc n, n
- movd %mm0, %eax
- adc n, %eax
- mov %eax, (rp)
-
-L(rtn): emms
- pop %ebp
- pop %ebx
- pop %esi
- pop %edi
- ret
-
-L(one): pmuludq %mm7, %mm7
- movq %mm7, -4(rp)
- emms
- pop %esi
- pop %edi
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/atom/sublsh1_n.asm b/gmp/mpn/x86/atom/sublsh1_n.asm
deleted file mode 100644
index d3e7e5b5cb..0000000000
--- a/gmp/mpn/x86/atom/sublsh1_n.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel Atom mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
-include_mpn(`x86/k7/sublsh1_n.asm')
diff --git a/gmp/mpn/x86/atom/sublsh2_n.asm b/gmp/mpn/x86/atom/sublsh2_n.asm
deleted file mode 100644
index 79405cf9f4..0000000000
--- a/gmp/mpn/x86/atom/sublsh2_n.asm
+++ /dev/null
@@ -1,57 +0,0 @@
-dnl Intel Atom mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
-
-dnl Contributed to the GNU project by Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 30)
-
-ifdef(`OPERATION_addlsh2_n', `
- define(M4_inst, adcl)
- define(M4_opp, subl)
- define(M4_function, mpn_addlsh2_n)
- define(M4_function_c, mpn_addlsh2_nc)
- define(M4_ip_function_c, mpn_addlsh2_nc_ip1)
- define(M4_ip_function, mpn_addlsh2_n_ip1)
-',`ifdef(`OPERATION_sublsh2_n', `
- define(M4_inst, sbbl)
- define(M4_opp, addl)
- define(M4_function, mpn_sublsh2_n)
- define(M4_function_c, mpn_sublsh2_nc)
- define(M4_ip_function_c, mpn_sublsh2_nc_ip1)
- define(M4_ip_function, mpn_sublsh2_n_ip1)
-',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_sublsh2_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_sublsh2_n_ip1 mpn_sublsh2_nc_ip1)
-
-include_mpn(`x86/atom/aorslshC_n.asm')
diff --git a/gmp/mpn/x86/bd1/gmp-mparam.h b/gmp/mpn/x86/bd1/gmp-mparam.h
deleted file mode 100644
index 7d80a1cb4c..0000000000
--- a/gmp/mpn/x86/bd1/gmp-mparam.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2005, 2008-2010, 2014 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 3600 MHz Bulldozer Zambezi */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 3
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 27
-
-#define MUL_TOOM22_THRESHOLD 32
-#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 154
-#define MUL_TOOM6H_THRESHOLD 230
-#define MUL_TOOM8H_THRESHOLD 354
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 48
-#define SQR_TOOM3_THRESHOLD 87
-#define SQR_TOOM4_THRESHOLD 204
-#define SQR_TOOM6_THRESHOLD 315
-#define SQR_TOOM8_THRESHOLD 430
-
-#define MULMID_TOOM42_THRESHOLD 48
-
-#define MULMOD_BNM1_THRESHOLD 21
-#define SQRMOD_BNM1_THRESHOLD 23
-
-#define MUL_FFT_MODF_THRESHOLD 840 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 840, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \
- { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 31, 6}, \
- { 63, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 51, 8}, { 27, 7}, { 55, 8}, \
- { 31, 7}, { 63, 8}, { 39, 7}, { 79, 9}, \
- { 23, 8}, { 55, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \
- { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
- { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 159,11}, { 95,10}, \
- { 191,12}, { 63,11}, { 127,10}, { 271,11}, \
- { 159,10}, { 319, 9}, { 639,11}, { 191,10}, \
- { 383, 9}, { 767,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 639,12}, { 191,11}, { 383,10}, \
- { 799,13}, { 127,12}, { 255,11}, { 543,10}, \
- { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
- { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \
- { 383,11}, { 799,10}, { 1599,11}, { 863,10}, \
- { 1727,12}, { 447,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \
- { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \
- { 831,11}, { 1727,10}, { 3455,14}, { 255,13}, \
- { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
- { 1919,11}, { 3839,12}, { 1983,11}, { 3967,10}, \
- { 7935,14}, { 511,13}, { 1023,12}, { 2239,13}, \
- { 1151,12}, { 2495,11}, { 4991,13}, { 1279,12}, \
- { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \
- { 1535,12}, { 3071,13}, { 1663,12}, { 3455,13}, \
- { 1791,12}, { 3583,13}, { 1919,12}, { 3967,11}, \
- { 7935,15}, { 511,14}, { 1023,13}, { 2175,12}, \
- { 4479,13}, { 2431,12}, { 4991,14}, { 1279,13}, \
- { 2943,12}, { 6015,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 160
-#define MUL_FFT_THRESHOLD 7808
-
-#define SQR_FFT_MODF_THRESHOLD 690 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 690, 5}, { 28, 6}, { 15, 5}, { 32, 6}, \
- { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \
- { 43, 7}, { 23, 6}, { 47, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
- { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
- { 39, 7}, { 79, 8}, { 43, 9}, { 23, 8}, \
- { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 127,10}, { 79, 9}, { 167,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 159,11}, \
- { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
- { 271,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 191,10}, { 383,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \
- { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \
- { 639,12}, { 191,11}, { 383,10}, { 799,11}, \
- { 415,13}, { 127,12}, { 255,11}, { 511,10}, \
- { 1023,11}, { 543,10}, { 1087,11}, { 607,10}, \
- { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 863,12}, { 447,11}, { 927,13}, \
- { 255,12}, { 511,11}, { 1055,10}, { 2111,11}, \
- { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2111,12}, { 1087,11}, \
- { 2239,10}, { 4479,12}, { 1215,11}, { 2431,13}, \
- { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1727,11}, { 3455,13}, { 895,12}, { 1983,11}, \
- { 3967,14}, { 511,13}, { 1023,12}, { 2239,11}, \
- { 4479,13}, { 1151,12}, { 2495,11}, { 4991,10}, \
- { 9983,13}, { 1279,12}, { 2623,13}, { 1407,12}, \
- { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \
- { 1791,12}, { 3583,13}, { 1919,12}, { 3967,15}, \
- { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
- { 2431,12}, { 4991,11}, { 9983,14}, { 1279,13}, \
- { 2687,12}, { 5375,13}, { 2943,12}, { 5887,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 166
-#define SQR_FFT_THRESHOLD 6784
-
-#define MULLO_BASECASE_THRESHOLD 5
-#define MULLO_DC_THRESHOLD 31
-#define MULLO_MUL_N_THRESHOLD 14709
-
-#define DC_DIV_QR_THRESHOLD 53
-#define DC_DIVAPPR_Q_THRESHOLD 230
-#define DC_BDIV_QR_THRESHOLD 50
-#define DC_BDIV_Q_THRESHOLD 136
-
-#define INV_MULMOD_BNM1_THRESHOLD 78
-#define INV_NEWTON_THRESHOLD 202
-#define INV_APPR_THRESHOLD 202
-
-#define BINV_NEWTON_THRESHOLD 236
-#define REDC_1_TO_REDC_N_THRESHOLD 55
-
-#define MU_DIV_QR_THRESHOLD 1442
-#define MU_DIVAPPR_Q_THRESHOLD 1652
-#define MUPI_DIV_QR_THRESHOLD 81
-#define MU_BDIV_QR_THRESHOLD 1787
-#define MU_BDIV_Q_THRESHOLD 1685
-
-#define POWM_SEC_TABLE 1,22,194,376,692,2657
-
-#define MATRIX22_STRASSEN_THRESHOLD 21
-#define HGCD_THRESHOLD 85
-#define HGCD_APPR_THRESHOLD 50
-#define HGCD_REDUCE_THRESHOLD 4455
-#define GCD_DC_THRESHOLD 456
-#define GCDEXT_DC_THRESHOLD 345
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 17
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 100
-#define SET_STR_PRECOMPUTE_THRESHOLD 960
-
-#define FAC_DSC_THRESHOLD 208
-#define FAC_ODD_THRESHOLD 26
diff --git a/gmp/mpn/x86/bd2/gmp-mparam.h b/gmp/mpn/x86/bd2/gmp-mparam.h
deleted file mode 100644
index c5a53f2f9f..0000000000
--- a/gmp/mpn/x86/bd2/gmp-mparam.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2005, 2008-2010, 2014 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 4000 MHz Piledriver Vishera */
-/* FFT tuning limit = 40000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.8 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 19
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 3
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 24
-
-#define MUL_TOOM22_THRESHOLD 30
-#define MUL_TOOM33_THRESHOLD 81
-#define MUL_TOOM44_THRESHOLD 153
-#define MUL_TOOM6H_THRESHOLD 222
-#define MUL_TOOM8H_THRESHOLD 357
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 99
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 38
-#define SQR_TOOM3_THRESHOLD 89
-#define SQR_TOOM4_THRESHOLD 196
-#define SQR_TOOM6_THRESHOLD 290
-#define SQR_TOOM8_THRESHOLD 454
-
-#define MULMID_TOOM42_THRESHOLD 68
-
-#define MULMOD_BNM1_THRESHOLD 19
-#define SQRMOD_BNM1_THRESHOLD 22
-
-#define MUL_FFT_MODF_THRESHOLD 636 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 636, 5}, { 27, 6}, { 27, 7}, { 15, 6}, \
- { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \
- { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \
- { 23, 8}, { 55, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
- { 95, 9}, { 191,11}, { 63,10}, { 127, 6}, \
- { 2111, 5}, { 4351, 6}, { 2239, 7}, { 1215, 9}, \
- { 311, 8}, { 639,10}, { 175, 8}, { 703,10}, \
- { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
- { 159, 9}, { 671,11}, { 191,10}, { 383, 9}, \
- { 799,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
- { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
- { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,12}, { 447,11}, { 895,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
- { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1471,13}, { 383,12}, { 767,11}, { 1599,12}, \
- { 831,11}, { 1727,10}, { 3455,12}, { 895,14}, \
- { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
- { 1087,11}, { 2239,10}, { 4479,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
- { 1919,14}, { 511,13}, { 1023,12}, { 2239,11}, \
- { 4479,13}, { 1151,12}, { 2495,11}, { 4991,13}, \
- { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
- { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \
- { 2175,12}, { 4479,13}, { 2431,12}, { 4991,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
- { 3455,14}, { 1791,13}, { 3967,12}, { 7935,11}, \
- { 15871,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
- { 2303,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 172
-#define MUL_FFT_THRESHOLD 6784
-
-#define SQR_FFT_MODF_THRESHOLD 606 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 606, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
- { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
- { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
- { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
- { 95, 9}, { 191,11}, { 63,10}, { 159,11}, \
- { 95,10}, { 191, 6}, { 3135, 5}, { 6399, 6}, \
- { 3455, 8}, { 895, 9}, { 479, 8}, { 991,10}, \
- { 255, 9}, { 575,11}, { 159, 9}, { 639,10}, \
- { 335, 8}, { 1343,10}, { 351,11}, { 191, 9}, \
- { 799,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543,11}, { 287,10}, { 607, 9}, { 1215,10}, \
- { 671,12}, { 191,11}, { 383,10}, { 767, 9}, \
- { 1535,10}, { 799,11}, { 415,10}, { 863,13}, \
- { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
- { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
- { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
- { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
- { 927,13}, { 255,12}, { 511,11}, { 1087,12}, \
- { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
- { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1727,12}, { 895,11}, \
- { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1087,11}, { 2239,10}, { 4479,12}, { 1215,13}, \
- { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
- { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \
- { 2495,11}, { 4991,13}, { 1279,12}, { 2623,13}, \
- { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
- { 3455,13}, { 1791,12}, { 3583,13}, { 1919,15}, \
- { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
- { 2431,12}, { 4991,14}, { 1279,13}, { 2943,12}, \
- { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
- { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
- { 2303,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 160
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 34
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 67
-#define DC_DIVAPPR_Q_THRESHOLD 196
-#define DC_BDIV_QR_THRESHOLD 67
-#define DC_BDIV_Q_THRESHOLD 112
-
-#define INV_MULMOD_BNM1_THRESHOLD 70
-#define INV_NEWTON_THRESHOLD 262
-#define INV_APPR_THRESHOLD 222
-
-#define BINV_NEWTON_THRESHOLD 288
-#define REDC_1_TO_REDC_N_THRESHOLD 67
-
-#define MU_DIV_QR_THRESHOLD 1718
-#define MU_DIVAPPR_Q_THRESHOLD 1652
-#define MUPI_DIV_QR_THRESHOLD 122
-#define MU_BDIV_QR_THRESHOLD 1387
-#define MU_BDIV_Q_THRESHOLD 1528
-
-#define POWM_SEC_TABLE 1,16,69,508,1378,2657,2825
-
-#define MATRIX22_STRASSEN_THRESHOLD 19
-#define HGCD_THRESHOLD 61
-#define HGCD_APPR_THRESHOLD 50
-#define HGCD_REDUCE_THRESHOLD 3389
-#define GCD_DC_THRESHOLD 492
-#define GCDEXT_DC_THRESHOLD 345
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 9
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 189
-#define SET_STR_PRECOMPUTE_THRESHOLD 541
-
-#define FAC_DSC_THRESHOLD 141
-#define FAC_ODD_THRESHOLD 29
diff --git a/gmp/mpn/x86/bdiv_dbm1c.asm b/gmp/mpn/x86/bdiv_dbm1c.asm
index 0288c475cd..dbee28fd94 100644
--- a/gmp/mpn/x86/bdiv_dbm1c.asm
+++ b/gmp/mpn/x86/bdiv_dbm1c.asm
@@ -1,51 +1,32 @@
dnl x86 mpn_bdiv_dbm1.
-dnl Copyright 2008, 2011 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
-include(`../config.m4')
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-C cycles/limb
-C P5
-C P6 model 0-8,10-12)
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan) 5.1
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood) 13.67
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom
-C AMD K6
-C AMD K7 3.5
-C AMD K8
-C AMD K10
+include(`../config.m4')
+C cycles/limb
+C K7: 3.5
+C P4 m0: ?
+C P4 m1: ?
+C P4 m2: 13.67
+C P4 m3: ?
+C P4 m4: ?
+C P6-13: 5.1
C TODO
C * Optimize for more x86 processors
@@ -76,17 +57,18 @@ PROLOGUE(mpn_bdiv_dbm1c)
cmp $2, %eax
jc L(b1)
jz L(b2)
-
-L(b3): lea -8(%esi), %esi
- lea 8(%edi), %edi
- add $-3, %ebp
- jmp L(3)
+ jmp L(b3)
L(b0): mov 4(%esi), %eax
lea -4(%esi), %esi
lea 12(%edi), %edi
add $-4, %ebp
jmp L(0)
+L(b3):
+ lea -8(%esi), %esi
+ lea 8(%edi), %edi
+ add $-3, %ebp
+ jmp L(3)
L(b2): mov 4(%esi), %eax
lea 4(%esi), %esi
@@ -95,7 +77,8 @@ L(b2): mov 4(%esi), %eax
jmp L(2)
ALIGN(8)
-L(top): mov 4(%esi), %eax
+L(top):
+ mov 4(%esi), %eax
mul %ecx
lea 16(%edi), %edi
sub %eax, %ebx
diff --git a/gmp/mpn/x86/bdiv_q_1.asm b/gmp/mpn/x86/bdiv_q_1.asm
deleted file mode 100644
index 825cd296a1..0000000000
--- a/gmp/mpn/x86/bdiv_q_1.asm
+++ /dev/null
@@ -1,208 +0,0 @@
-dnl x86 mpn_bdiv_q_1 -- mpn by limb exact division.
-
-dnl Rearranged from mpn/x86/dive_1.asm by Marco Bodrato.
-
-dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C P54 30.0
-C P55 29.0
-C P6 13.0 odd divisor, 12.0 even (strangely)
-C K6 14.0
-C K7 12.0
-C P4 42.0
-
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
-
-defframe(PARAM_SHIFT, 24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(VAR_INVERSE,`PARAM_SRC')
-
- TEXT
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t inverse, int shift)
-
- ALIGN(16)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_SHIFT, %ecx
- pushl %ebp FRAME_pushl()
-
- movl PARAM_INVERSE, %eax
- movl PARAM_SIZE, %ebp
- pushl %ebx FRAME_pushl()
-L(common):
- pushl %edi FRAME_pushl()
- pushl %esi FRAME_pushl()
-
- movl PARAM_SRC, %esi
- movl PARAM_DST, %edi
-
- leal (%esi,%ebp,4), %esi C src end
- leal (%edi,%ebp,4), %edi C dst end
- negl %ebp C -size
-
- movl %eax, VAR_INVERSE
- movl (%esi,%ebp,4), %eax C src[0]
-
- xorl %ebx, %ebx
- xorl %edx, %edx
-
- incl %ebp
- jz L(one)
-
- movl (%esi,%ebp,4), %edx C src[1]
-
- shrdl( %cl, %edx, %eax)
-
- movl VAR_INVERSE, %edx
- jmp L(entry)
-
-
- ALIGN(8)
- nop C k6 code alignment
- nop
-L(top):
- C eax q
- C ebx carry bit, 0 or -1
- C ecx shift
- C edx carry limb
- C esi src end
- C edi dst end
- C ebp counter, limbs, negative
-
- movl -4(%esi,%ebp,4), %eax
- subl %ebx, %edx C accumulate carry bit
-
- movl (%esi,%ebp,4), %ebx
-
- shrdl( %cl, %ebx, %eax)
-
- subl %edx, %eax C apply carry limb
- movl VAR_INVERSE, %edx
-
- sbbl %ebx, %ebx
-
-L(entry):
- imull %edx, %eax
-
- movl %eax, -4(%edi,%ebp,4)
- movl PARAM_DIVISOR, %edx
-
- mull %edx
-
- incl %ebp
- jnz L(top)
-
-
- movl -4(%esi), %eax C src high limb
-L(one):
- shrl %cl, %eax
- popl %esi FRAME_popl()
-
- addl %ebx, %eax C apply carry bit
-
- subl %edx, %eax C apply carry limb
-
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi)
-
- popl %edi
- popl %ebx
- popl %ebp
-
- ret
-
-EPILOGUE()
-
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-
- ALIGN(16)
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_DIVISOR, %eax
- pushl %ebp FRAME_pushl()
-
- movl $-1, %ecx C shift count
- movl PARAM_SIZE, %ebp
-
- pushl %ebx FRAME_pushl()
-
-L(strip_twos):
- incl %ecx
-
- shrl %eax
- jnc L(strip_twos)
-
- leal 1(%eax,%eax), %ebx C d without twos
- andl $127, %eax C d/2, 7 bits
-
-ifdef(`PIC',`
- LEA( binvert_limb_table, %edx)
- movzbl (%eax,%edx), %eax C inv 8 bits
-',`
- movzbl binvert_limb_table(%eax), %eax C inv 8 bits
-')
-
- leal (%eax,%eax), %edx C 2*inv
- movl %ebx, PARAM_DIVISOR C d without twos
- imull %eax, %eax C inv*inv
- imull %ebx, %eax C inv*inv*d
- subl %eax, %edx C inv = 2*inv - inv*inv*d
-
- leal (%edx,%edx), %eax C 2*inv
- imull %edx, %edx C inv*inv
- imull %ebx, %edx C inv*inv*d
- subl %edx, %eax C inv = 2*inv - inv*inv*d
-
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
- pushl %eax FRAME_pushl()
- imull PARAM_DIVISOR, %eax
- cmpl $1, %eax
- popl %eax FRAME_popl()')
-
- jmp L(common)
-EPILOGUE()
-
diff --git a/gmp/mpn/x86/bobcat/gmp-mparam.h b/gmp/mpn/x86/bobcat/gmp-mparam.h
deleted file mode 100644
index 198081f9fd..0000000000
--- a/gmp/mpn/x86/bobcat/gmp-mparam.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 1600 MHz AMD Bobcat Zacate E-350 */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 12
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 2
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 40
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 90
-#define MUL_TOOM44_THRESHOLD 154
-#define MUL_TOOM6H_THRESHOLD 270
-#define MUL_TOOM8H_THRESHOLD 490
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 95
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 110
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 38
-#define SQR_TOOM3_THRESHOLD 121
-#define SQR_TOOM4_THRESHOLD 212
-#define SQR_TOOM6_THRESHOLD 303
-#define SQR_TOOM8_THRESHOLD 454
-
-#define MULMID_TOOM42_THRESHOLD 74
-
-#define MULMOD_BNM1_THRESHOLD 18
-#define SQRMOD_BNM1_THRESHOLD 23
-
-#define MUL_FFT_MODF_THRESHOLD 660 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 660, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
- { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 159,11}, \
- { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
- { 159, 9}, { 639,10}, { 335, 9}, { 671,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
- { 799,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
- { 1215,10}, { 671,12}, { 191,11}, { 383,10}, \
- { 799, 9}, { 1599,11}, { 415,13}, { 127,12}, \
- { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
- { 1215,11}, { 671,10}, { 1343,11}, { 735,10}, \
- { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,12}, { 447,11}, { 991,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \
- { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
- { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
- { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \
- { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
- { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
- { 5887,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 139
-#define MUL_FFT_THRESHOLD 7552
-
-#define SQR_FFT_MODF_THRESHOLD 606 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 606, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
- { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
- { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
- { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543, 8}, { 1087,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 335, 9}, { 671, 8}, { 1343,10}, \
- { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
- { 399, 9}, { 799,10}, { 415, 9}, { 831,12}, \
- { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
- { 1215,11}, { 319,10}, { 671, 9}, { 1343,12}, \
- { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
- { 831,13}, { 127,12}, { 255,11}, { 511,10}, \
- { 1023,11}, { 543,10}, { 1087,11}, { 607,10}, \
- { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 863,12}, { 447,11}, { 991,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1471,13}, { 383,12}, { 767,11}, { 1599,12}, \
- { 831,11}, { 1727,12}, { 959,14}, { 255,13}, \
- { 511,12}, { 1215,13}, { 639,12}, { 1471,13}, \
- { 767,12}, { 1727,13}, { 895,12}, { 1983,14}, \
- { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
- { 2431,13}, { 1407,14}, { 767,13}, { 1663,12}, \
- { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \
- { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \
- { 2943,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 147
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 5
-#define MULLO_DC_THRESHOLD 45
-#define MULLO_MUL_N_THRESHOLD 14281
-
-#define DC_DIV_QR_THRESHOLD 71
-#define DC_DIVAPPR_Q_THRESHOLD 238
-#define DC_BDIV_QR_THRESHOLD 67
-#define DC_BDIV_Q_THRESHOLD 151
-
-#define INV_MULMOD_BNM1_THRESHOLD 66
-#define INV_NEWTON_THRESHOLD 228
-#define INV_APPR_THRESHOLD 222
-
-#define BINV_NEWTON_THRESHOLD 270
-#define REDC_1_TO_REDC_N_THRESHOLD 71
-
-#define MU_DIV_QR_THRESHOLD 1718
-#define MU_DIVAPPR_Q_THRESHOLD 1718
-#define MUPI_DIV_QR_THRESHOLD 91
-#define MU_BDIV_QR_THRESHOLD 1589
-#define MU_BDIV_Q_THRESHOLD 1718
-
-#define POWM_SEC_TABLE 1,16,96,416,1185
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 88
-#define HGCD_APPR_THRESHOLD 137
-#define HGCD_REDUCE_THRESHOLD 3664
-#define GCD_DC_THRESHOLD 465
-#define GCDEXT_DC_THRESHOLD 345
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 18
-#define GET_STR_PRECOMPUTE_THRESHOLD 34
-#define SET_STR_DC_THRESHOLD 270
-#define SET_STR_PRECOMPUTE_THRESHOLD 828
-
-#define FAC_DSC_THRESHOLD 256
-#define FAC_ODD_THRESHOLD 34
diff --git a/gmp/mpn/x86/cnd_aors_n.asm b/gmp/mpn/x86/cnd_aors_n.asm
deleted file mode 100644
index 74f4917ecc..0000000000
--- a/gmp/mpn/x86/cnd_aors_n.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-dnl X86 mpn_cnd_add_n, mpn_cnd_sub_n
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 5.4
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 14.5
-C P4 model 3-4 (Prescott) 21
-C Intel atom 11
-C AMD K6 ?
-C AMD K7 3.4
-C AMD K8 ?
-
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebp')
-define(`n', `%ecx')
-define(`cnd', `20(%esp)')
-define(`cy', `%edx')
-
-ifdef(`OPERATION_cnd_add_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_cnd_add_n)')
-ifdef(`OPERATION_cnd_sub_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_cnd_sub_n)')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- add $-16, %esp
- mov %ebp, (%esp)
- mov %ebx, 4(%esp)
- mov %esi, 8(%esp)
- mov %edi, 12(%esp)
-
- C make cnd into a full mask
- mov cnd, %eax
- neg %eax
- sbb %eax, %eax
- mov %eax, cnd
-
- C load parameters into registers
- mov 24(%esp), rp
- mov 28(%esp), up
- mov 32(%esp), vp
- mov 36(%esp), n
-
- mov (vp), %eax
- mov (up), %ebx
-
- C put operand pointers just beyond their last limb
- lea (vp,n,4), vp
- lea (up,n,4), up
- lea -4(rp,n,4), rp
- neg n
-
- and cnd, %eax
- ADDSUB %eax, %ebx
- sbb cy, cy
- inc n
- je L(end)
-
- ALIGN(16)
-L(top): mov (vp,n,4), %eax
- and cnd, %eax
- mov %ebx, (rp,n,4)
- mov (up,n,4), %ebx
- add cy, cy
- ADCSBB %eax, %ebx
- sbb cy, cy
- inc n
- jne L(top)
-
-L(end): mov %ebx, (rp)
- xor %eax, %eax
- sub cy, %eax
-
- mov (%esp), %ebp
- mov 4(%esp), %ebx
- mov 8(%esp), %esi
- mov 12(%esp), %edi
- add $16, %esp
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/copyd.asm b/gmp/mpn/x86/copyd.asm
index 51fa19568b..4ce3bbbc69 100644
--- a/gmp/mpn/x86/copyd.asm
+++ b/gmp/mpn/x86/copyd.asm
@@ -1,42 +1,31 @@
dnl x86 mpn_copyd -- copy limb vector, decrementing.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb startup (approx)
-C P5 1.0 40
-C P6 2.4 70
-C K6 1.0 55
-C K7 1.3 75
-C P4 2.6 175
+C P5: 1.0 40
+C P6 2.4 70
+C K6 1.0 55
+C K7: 1.3 75
+C P4: 2.6 175
C
C (Startup time includes some function call overheads.)
diff --git a/gmp/mpn/x86/copyi.asm b/gmp/mpn/x86/copyi.asm
index f6b0354b4f..c6bbaeee65 100644
--- a/gmp/mpn/x86/copyi.asm
+++ b/gmp/mpn/x86/copyi.asm
@@ -1,42 +1,31 @@
dnl x86 mpn_copyi -- copy limb vector, incrementing.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb startup (approx)
-C P5 1.0 35
-C P6 0.75 45
-C K6 1.0 30
-C K7 1.3 65
-C P4 1.0 120
+C P5: 1.0 35
+C P6 0.75 45
+C K6 1.0 30
+C K7: 1.3 65
+C P4: 1.0 120
C
C (Startup time includes some function call overheads.)
diff --git a/gmp/mpn/x86/core2/gmp-mparam.h b/gmp/mpn/x86/core2/gmp-mparam.h
deleted file mode 100644
index b370eb5877..0000000000
--- a/gmp/mpn/x86/core2/gmp-mparam.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2133 MHz Core 2 (65nm) */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-14, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 4
-#define MOD_1_UNNORM_THRESHOLD 4
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 3
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 19
-
-#define MUL_TOOM22_THRESHOLD 26
-#define MUL_TOOM33_THRESHOLD 90
-#define MUL_TOOM44_THRESHOLD 144
-#define MUL_TOOM6H_THRESHOLD 286
-#define MUL_TOOM8H_THRESHOLD 430
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 114
-#define SQR_TOOM4_THRESHOLD 178
-#define SQR_TOOM6_THRESHOLD 262
-#define SQR_TOOM8_THRESHOLD 357
-
-#define MULMID_TOOM42_THRESHOLD 66
-
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 21
-
-#define MUL_FFT_MODF_THRESHOLD 600 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 600, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
- { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
- { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
- { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
- { 43, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
- { 95, 9}, { 191,11}, { 63,10}, { 159,11}, \
- { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
- { 271, 9}, { 543, 8}, { 1087,11}, { 159,10}, \
- { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
- { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
- { 399, 9}, { 799,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 543, 9}, { 1087,11}, { 287,10}, \
- { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \
- { 351,12}, { 191,11}, { 383,10}, { 799, 9}, \
- { 1599,13}, { 127,12}, { 255,11}, { 543,10}, \
- { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \
- { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
- { 799,10}, { 1599,11}, { 863,10}, { 1727,12}, \
- { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
- { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1087,11}, { 2239,10}, \
- { 4479,12}, { 1215,13}, { 639,12}, { 1471,11}, \
- { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \
- { 1983,14}, { 511,13}, { 1023,12}, { 2239,11}, \
- { 4479,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
- { 2623,13}, { 1407,12}, { 2815,14}, { 767,13}, \
- { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 149
-#define MUL_FFT_THRESHOLD 6784
-
-#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 500, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
- { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 127,10}, { 79, 9}, { 159,10}, \
- { 95,11}, { 63,10}, { 143, 9}, { 287,10}, \
- { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
- { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \
- { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
- { 831,10}, { 431,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 543, 9}, { 1087,11}, { 287,10}, \
- { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
- { 703,12}, { 191,11}, { 383,10}, { 799,11}, \
- { 415,10}, { 863,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
- { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,12}, { 447,11}, { 959,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
- { 383,12}, { 831,11}, { 1727,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1087,11}, { 2239,12}, \
- { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1727,13}, { 895,12}, { 1983,14}, \
- { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
- { 2495,13}, { 1407,12}, { 2943,14}, { 767,13}, \
- { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 145
-#define SQR_FFT_THRESHOLD 5312
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 29
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 21
-#define DC_DIVAPPR_Q_THRESHOLD 50
-#define DC_BDIV_QR_THRESHOLD 79
-#define DC_BDIV_Q_THRESHOLD 174
-
-#define INV_MULMOD_BNM1_THRESHOLD 50
-#define INV_NEWTON_THRESHOLD 39
-#define INV_APPR_THRESHOLD 37
-
-#define BINV_NEWTON_THRESHOLD 318
-#define REDC_1_TO_REDC_N_THRESHOLD 87
-
-#define MU_DIV_QR_THRESHOLD 1099
-#define MU_DIVAPPR_Q_THRESHOLD 792
-#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 1442
-#define MU_BDIV_Q_THRESHOLD 1589
-
-#define POWM_SEC_TABLE 3,32,95,480,597,2657
-
-#define MATRIX22_STRASSEN_THRESHOLD 21
-#define HGCD_THRESHOLD 83
-#define HGCD_APPR_THRESHOLD 159
-#define HGCD_REDUCE_THRESHOLD 3389
-#define GCD_DC_THRESHOLD 379
-#define GCDEXT_DC_THRESHOLD 309
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 10
-#define GET_STR_PRECOMPUTE_THRESHOLD 25
-#define SET_STR_DC_THRESHOLD 442
-#define SET_STR_PRECOMPUTE_THRESHOLD 1104
-
-#define FAC_DSC_THRESHOLD 155
-#define FAC_ODD_THRESHOLD 34
diff --git a/gmp/mpn/x86/coreihwl/gmp-mparam.h b/gmp/mpn/x86/coreihwl/gmp-mparam.h
deleted file mode 100644
index e2b289cc3c..0000000000
--- a/gmp/mpn/x86/coreihwl/gmp-mparam.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/* x86/coreihwl gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2900 MHz Core i5 Haswell */
-/* FFT tuning limit = 40000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 16
-#define MOD_1_UNNORM_THRESHOLD 13
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 15
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 19
-
-#define MUL_TOOM22_THRESHOLD 27
-#define MUL_TOOM33_THRESHOLD 90
-#define MUL_TOOM44_THRESHOLD 218
-#define MUL_TOOM6H_THRESHOLD 318
-#define MUL_TOOM8H_THRESHOLD 490
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 44
-#define SQR_TOOM3_THRESHOLD 137
-#define SQR_TOOM4_THRESHOLD 242
-#define SQR_TOOM6_THRESHOLD 351
-#define SQR_TOOM8_THRESHOLD 597
-
-#define MULMID_TOOM42_THRESHOLD 98
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 21
-
-#define MUL_FFT_MODF_THRESHOLD 630 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 630, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \
- { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 55,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \
- { 111,11}, { 63,10}, { 159,11}, { 95,10}, \
- { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \
- { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
- { 799,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543,11}, { 287,10}, { 607, 9}, { 1215,11}, \
- { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
- { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
- { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 959,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
- { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1087,11}, \
- { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \
- { 1471,11}, { 2943,10}, { 5887,13}, { 767,12}, \
- { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \
- { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
- { 2495,13}, { 1279,12}, { 2559,13}, { 1407,12}, \
- { 2943,11}, { 5887,14}, { 767,13}, { 1535,12}, \
- { 3071,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
- { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
- { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
- { 1535,13}, { 3455,14}, { 1791,13}, { 3967,12}, \
- { 7935,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
- { 2303,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 168
-#define MUL_FFT_THRESHOLD 7424
-
-#define SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 530, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
- { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
- { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \
- { 63, 8}, { 39, 9}, { 23, 8}, { 55,10}, \
- { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
- { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
- { 799,12}, { 127,11}, { 255,10}, { 511, 9}, \
- { 1023,10}, { 543,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 671,11}, { 351,12}, { 191,11}, \
- { 383,10}, { 799,11}, { 415,10}, { 831,13}, \
- { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
- { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
- { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \
- { 383,11}, { 799,10}, { 1599,11}, { 863,10}, \
- { 1727,12}, { 447,11}, { 991,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,12}, { 959,11}, { 1983,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2047,12}, { 1087,11}, \
- { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \
- { 1471,11}, { 2943,13}, { 767,12}, { 1727,13}, \
- { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \
- { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
- { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \
- { 1535,12}, { 3071,13}, { 1663,12}, { 3455,13}, \
- { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
- { 2175,12}, { 4479,13}, { 2431,12}, { 4863,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
- { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \
- { 2047,13}, { 4479,14}, { 2303,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 170
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 57
-#define MULLO_MUL_N_THRESHOLD 14281
-
-#define DC_DIV_QR_THRESHOLD 23
-#define DC_DIVAPPR_Q_THRESHOLD 63
-#define DC_BDIV_QR_THRESHOLD 87
-#define DC_BDIV_Q_THRESHOLD 204
-
-#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 75
-#define INV_APPR_THRESHOLD 67
-
-#define BINV_NEWTON_THRESHOLD 296
-#define REDC_1_TO_REDC_N_THRESHOLD 79
-
-#define MU_DIV_QR_THRESHOLD 872
-#define MU_DIVAPPR_Q_THRESHOLD 654
-#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 1858
-#define MU_BDIV_Q_THRESHOLD 2089
-
-#define POWM_SEC_TABLE 1,17,127,508,1603
-
-#define MATRIX22_STRASSEN_THRESHOLD 19
-#define HGCD_THRESHOLD 61
-#define HGCD_APPR_THRESHOLD 60
-#define HGCD_REDUCE_THRESHOLD 3810
-#define GCD_DC_THRESHOLD 263
-#define GCDEXT_DC_THRESHOLD 278
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 11
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 527
-#define SET_STR_PRECOMPUTE_THRESHOLD 1178
-
-#define FAC_DSC_THRESHOLD 187
-#define FAC_ODD_THRESHOLD 34
diff --git a/gmp/mpn/x86/coreinhm/gmp-mparam.h b/gmp/mpn/x86/coreinhm/gmp-mparam.h
deleted file mode 100644
index 13289c0c23..0000000000
--- a/gmp/mpn/x86/coreinhm/gmp-mparam.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2667 MHz Core i7 Nehalem */
-/* FFT tuning limit = 100000000 */
-/* Generated by tuneup.c, 2014-03-19, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 25
-#define MOD_1_UNNORM_THRESHOLD 15
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 18
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 15
-
-#define MUL_TOOM22_THRESHOLD 26
-#define MUL_TOOM33_THRESHOLD 89
-#define MUL_TOOM44_THRESHOLD 214
-#define MUL_TOOM6H_THRESHOLD 327
-#define MUL_TOOM8H_THRESHOLD 466
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 159
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 95
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 44
-#define SQR_TOOM3_THRESHOLD 145
-#define SQR_TOOM4_THRESHOLD 232
-#define SQR_TOOM6_THRESHOLD 342
-#define SQR_TOOM8_THRESHOLD 502
-
-#define MULMID_TOOM42_THRESHOLD 78
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 21
-
-#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 606, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
- { 15, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \
- { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
- { 39, 7}, { 23, 6}, { 47, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
- { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
- { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
- { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 335,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 399,12}, { 127,11}, { 255,10}, \
- { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \
- { 607,11}, { 319,10}, { 639,12}, { 191,11}, \
- { 383,10}, { 767,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \
- { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 735,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 927,10}, \
- { 1855,11}, { 991,13}, { 255,12}, { 511,11}, \
- { 1119,12}, { 575,11}, { 1215,10}, { 2431,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,12}, { 895,11}, { 1855,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1023,11}, { 2111,12}, \
- { 1087,11}, { 2239,10}, { 4479,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \
- { 1727,11}, { 3455,13}, { 895,12}, { 1983,11}, \
- { 3967,14}, { 511,13}, { 1023,12}, { 2239,11}, \
- { 4479,13}, { 1151,12}, { 2495,11}, { 4991,13}, \
- { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
- { 3455,13}, { 1919,12}, { 3967,15}, { 511,14}, \
- { 1023,13}, { 2175,12}, { 4479,13}, { 2431,12}, \
- { 4991,14}, { 1279,13}, { 2687,12}, { 5503,13}, \
- { 2943,12}, { 6015,14}, { 1535,13}, { 3455,14}, \
- { 1791,13}, { 3967,12}, { 7935,15}, { 1023,14}, \
- { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
- { 9983,14}, { 2559,13}, { 5503,14}, { 2815,13}, \
- { 6015,15}, { 1535,14}, { 3839,13}, { 7935,16}, \
- { 1023,15}, { 2047,14}, { 4095,13}, { 8191,12}, \
- { 16383,11}, { 32767,10}, { 65535, 9}, { 131071, 8}, \
- { 256, 9}, { 512,10}, { 1024,11}, { 2048,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 192
-#define MUL_FFT_THRESHOLD 6784
-
-#define SQR_FFT_MODF_THRESHOLD 555 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 555, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
- { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \
- { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
- { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
- { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
- { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \
- { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \
- { 95,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511, 8}, { 1023,10}, { 271, 9}, { 543,10}, \
- { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 335, 9}, { 671,10}, { 351,11}, { 191,10}, \
- { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
- { 415,12}, { 127,11}, { 255,10}, { 511, 9}, \
- { 1023,10}, { 543,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 671,11}, { 351,12}, { 191,11}, \
- { 383,10}, { 799,11}, { 415,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
- { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
- { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
- { 799,10}, { 1599,11}, { 863,10}, { 1727,12}, \
- { 447,11}, { 991,10}, { 1983,13}, { 255,12}, \
- { 511,11}, { 1023,10}, { 2047,11}, { 1087,12}, \
- { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
- { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
- { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \
- { 3455,12}, { 895,11}, { 1791,12}, { 959,11}, \
- { 1983,14}, { 255,13}, { 511,12}, { 1023,11}, \
- { 2111,12}, { 1087,11}, { 2239,10}, { 4479,12}, \
- { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \
- { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \
- { 895,12}, { 1983,11}, { 3967,14}, { 511,13}, \
- { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \
- { 2495,13}, { 1279,12}, { 2623,13}, { 1407,12}, \
- { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \
- { 1919,12}, { 3967,15}, { 511,14}, { 1023,13}, \
- { 2175,12}, { 4479,13}, { 2431,12}, { 4863,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
- { 3455,14}, { 1791,13}, { 3967,12}, { 7935,15}, \
- { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \
- { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \
- { 1535,14}, { 3327,13}, { 6655,14}, { 3839,13}, \
- { 7935,16}, { 1023,15}, { 2047,14}, { 4095,13}, \
- { 8191,12}, { 16383,11}, { 32767,10}, { 65535, 9}, \
- { 131071, 8}, { 256, 9}, { 512,10}, { 1024,11}, \
- { 2048,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 201
-#define SQR_FFT_THRESHOLD 5312
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 38
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 22
-#define DC_DIVAPPR_Q_THRESHOLD 43
-#define DC_BDIV_QR_THRESHOLD 78
-#define DC_BDIV_Q_THRESHOLD 157
-
-#define INV_MULMOD_BNM1_THRESHOLD 50
-#define INV_NEWTON_THRESHOLD 15
-#define INV_APPR_THRESHOLD 18
-
-#define BINV_NEWTON_THRESHOLD 351
-#define REDC_1_TO_REDC_N_THRESHOLD 84
-
-#define MU_DIV_QR_THRESHOLD 889
-#define MU_DIVAPPR_Q_THRESHOLD 483
-#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 1589
-#define MU_BDIV_Q_THRESHOLD 1787
-
-#define POWM_SEC_TABLE 2,25,95,473,1357
-
-#define MATRIX22_STRASSEN_THRESHOLD 20
-#define HGCD_THRESHOLD 52
-#define HGCD_APPR_THRESHOLD 51
-#define HGCD_REDUCE_THRESHOLD 3524
-#define GCD_DC_THRESHOLD 213
-#define GCDEXT_DC_THRESHOLD 249
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 145
-#define SET_STR_PRECOMPUTE_THRESHOLD 545
-
-#define FAC_DSC_THRESHOLD 91
-#define FAC_ODD_THRESHOLD 29
diff --git a/gmp/mpn/x86/coreisbr/gmp-mparam.h b/gmp/mpn/x86/coreisbr/gmp-mparam.h
deleted file mode 100644
index 9b227a71ba..0000000000
--- a/gmp/mpn/x86/coreisbr/gmp-mparam.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 3300 MHz Core i5 Sandy Bridge */
-/* FFT tuning limit = 40000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 18
-#define MOD_1_UNNORM_THRESHOLD 11
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 16
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 19
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 99
-#define MUL_TOOM44_THRESHOLD 160
-#define MUL_TOOM6H_THRESHOLD 268
-#define MUL_TOOM8H_THRESHOLD 490
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 106
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 109
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 108
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 137
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 48
-#define SQR_TOOM3_THRESHOLD 105
-#define SQR_TOOM4_THRESHOLD 256
-#define SQR_TOOM6_THRESHOLD 366
-#define SQR_TOOM8_THRESHOLD 562
-
-#define MULMID_TOOM42_THRESHOLD 98
-
-#define MULMOD_BNM1_THRESHOLD 19
-#define SQRMOD_BNM1_THRESHOLD 23
-
-#define MUL_FFT_MODF_THRESHOLD 636 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 636, 5}, { 27, 6}, { 28, 7}, { 15, 6}, \
- { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 55,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
- { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 335, 9}, { 671,11}, { 191,10}, \
- { 383, 9}, { 767,10}, { 399, 9}, { 799,11}, \
- { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \
- { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
- { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
- { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,12}, { 447,11}, { 959,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
- { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1471,13}, { 383,12}, { 767,11}, { 1599,12}, \
- { 831,11}, { 1727,12}, { 959,14}, { 255,13}, \
- { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1727,13}, { 895,12}, { 1983,14}, \
- { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
- { 2431,13}, { 1279,12}, { 2559,13}, { 1407,12}, \
- { 2943,14}, { 767,13}, { 1535,12}, { 3071,13}, \
- { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 149
-#define MUL_FFT_THRESHOLD 7424
-
-#define SQR_FFT_MODF_THRESHOLD 555 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 555, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
- { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
- { 43, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
- { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 159,11}, \
- { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 543,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
- { 799,10}, { 415,12}, { 127,11}, { 255,10}, \
- { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \
- { 607,11}, { 319,10}, { 671,11}, { 351,12}, \
- { 191,11}, { 383,10}, { 799,11}, { 415,13}, \
- { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
- { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 959,10}, \
- { 1919,11}, { 991,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,12}, { 959,11}, { 1919,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2047,12}, { 1087,11}, \
- { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \
- { 1471,11}, { 2943,13}, { 767,12}, { 1727,13}, \
- { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \
- { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
- { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \
- { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \
- { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2943,12}, \
- { 5887,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 159
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 62
-#define MULLO_MUL_N_THRESHOLD 14281
-
-#define DC_DIV_QR_THRESHOLD 25
-#define DC_DIVAPPR_Q_THRESHOLD 43
-#define DC_BDIV_QR_THRESHOLD 99
-#define DC_BDIV_Q_THRESHOLD 240
-
-#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 14
-#define INV_APPR_THRESHOLD 13
-
-#define BINV_NEWTON_THRESHOLD 363
-#define REDC_1_TO_REDC_N_THRESHOLD 90
-
-#define MU_DIV_QR_THRESHOLD 998
-#define MU_DIVAPPR_Q_THRESHOLD 667
-#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 1787
-#define MU_BDIV_Q_THRESHOLD 2130
-
-#define POWM_SEC_TABLE 1,16,126,480,1317
-
-#define MATRIX22_STRASSEN_THRESHOLD 21
-#define HGCD_THRESHOLD 61
-#define HGCD_APPR_THRESHOLD 56
-#define HGCD_REDUCE_THRESHOLD 3810
-#define GCD_DC_THRESHOLD 283
-#define GCDEXT_DC_THRESHOLD 309
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 399
-#define SET_STR_PRECOMPUTE_THRESHOLD 1183
-
-#define FAC_DSC_THRESHOLD 194
-#define FAC_ODD_THRESHOLD 34
diff --git a/gmp/mpn/x86/darwin.m4 b/gmp/mpn/x86/darwin.m4
index f8363db3f7..7ef8dfc105 100644
--- a/gmp/mpn/x86/darwin.m4
+++ b/gmp/mpn/x86/darwin.m4
@@ -1,82 +1,40 @@
divert(-1)
-dnl Copyright 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2007 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`DARWIN')
-
dnl Usage LEA(symbol,reg)
dnl
-dnl We maintain lists of stuff to append in load_eip and darwin_bd. The
-dnl `index' stuff is needed to suppress repeated definitions. To avoid
-dnl getting fooled by "var" and "var1", we add 'bol ' (the end of
-dnl 'indirect_symbol') at the beginning and and a newline at the end. This
-dnl might be a bit fragile.
+dnl FIXME: Only handles one symbol per assembly file because of the
+dnl way EPILOGUE_cpu is handled.
-define(`LEA',
-m4_assert_numargs(2)
-`ifdef(`PIC',`
-ifelse(index(defn(`load_eip'), `$2'),-1,
-`m4append(`load_eip',
-`L(movl_eip_`'substr($2,1)):
+define(`LEA',`
+define(`EPILOGUE_cpu',
+` L(movl_eip_`'substr($2,1)):
movl (%esp), $2
ret_internal
-')')
-ifelse(index(defn(`darwin_bd'), `bol $1
-'),-1,
-`m4append(`darwin_bd',
-` .section __IMPORT,__pointers,non_lazy_symbol_pointers
+ .section __IMPORT,__pointers,non_lazy_symbol_pointers
L($1`'$non_lazy_ptr):
.indirect_symbol $1
.long 0
-')')
+')
call L(movl_eip_`'substr($2,1))
movl L($1`'$non_lazy_ptr)-.($2), $2
-',`
- movl `$'$1, $2
-')')
-
-
-dnl EPILOGUE_cpu
-
-define(`EPILOGUE_cpu',`load_eip`'darwin_bd')
-
-define(`load_eip', `') dnl updated in LEA
-define(`darwin_bd', `') dnl updated in LEA
-
-
-dnl Usage: CALL(funcname)
-dnl
-
-define(`CALL',
-m4_assert_numargs(1)
-`call GSYM_PREFIX`'$1')
-
-undefine(`PIC_WITH_EBX')
+')
divert`'dnl
diff --git a/gmp/mpn/x86/dive_1.asm b/gmp/mpn/x86/dive_1.asm
index 9a6cbb7931..d2d02f9f72 100644
--- a/gmp/mpn/x86/dive_1.asm
+++ b/gmp/mpn/x86/dive_1.asm
@@ -1,32 +1,21 @@
dnl x86 mpn_divexact_1 -- mpn by limb exact division.
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -108,7 +97,7 @@ ifdef(`PIC',`
subl %edx, %eax C inv = 2*inv - inv*inv*d
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
imull PARAM_DIVISOR, %eax
cmpl $1, %eax
diff --git a/gmp/mpn/x86/divrem_1.asm b/gmp/mpn/x86/divrem_1.asm
index 255d4935c3..a5fb88071d 100644
--- a/gmp/mpn/x86/divrem_1.asm
+++ b/gmp/mpn/x86/divrem_1.asm
@@ -1,32 +1,22 @@
dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
-dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/divrem_2.asm b/gmp/mpn/x86/divrem_2.asm
index 4c38ad0acb..bbadda921c 100644
--- a/gmp/mpn/x86/divrem_2.asm
+++ b/gmp/mpn/x86/divrem_2.asm
@@ -3,30 +3,19 @@ dnl x86 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
dnl Copyright 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -92,7 +81,7 @@ PROLOGUE(mpn_divrem_2)
seta %dl
cmp 20(%esp), %ebp
setae %al
- orb %dl, %al C "orb" form to placate Sun tools
+ or %dl, %al
jne L(35)
L(8):
mov 60(%esp), %esi C fn
@@ -185,7 +174,7 @@ L(9): mov 64(%esp), %esi C up
L(fix): seta %dl
cmp 20(%esp), %ebp
setae %al
- orb %dl, %al C "orb" form to placate Sun tools
+ or %dl, %al
je L(bck)
inc %edi
sub 20(%esp), %ebp
diff --git a/gmp/mpn/x86/fat/com.c b/gmp/mpn/x86/fat/com.c
deleted file mode 100644
index d359d4ce73..0000000000
--- a/gmp/mpn/x86/fat/com.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_com.
-
-Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/com.c"
diff --git a/gmp/mpn/x86/fat/diveby3.c b/gmp/mpn/x86/fat/diveby3.c
new file mode 100644
index 0000000000..7ea0161b72
--- /dev/null
+++ b/gmp/mpn/x86/fat/diveby3.c
@@ -0,0 +1,21 @@
+/* Fat binary fallback mpn_divexact_by3c.
+
+Copyright 2003, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/diveby3.c"
diff --git a/gmp/mpn/x86/fat/fat.c b/gmp/mpn/x86/fat/fat.c
index 1740813886..c3d1866c69 100644
--- a/gmp/mpn/x86/fat/fat.c
+++ b/gmp/mpn/x86/fat/fat.c
@@ -4,33 +4,22 @@
THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR
COMPLETELY IN FUTURE GNU MP RELEASES.
-Copyright 2003, 2004, 2011, 2012 Free Software Foundation, Inc.
+Copyright 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include <stdio.h> /* for printf */
#include <stdlib.h> /* for getenv */
@@ -42,10 +31,14 @@ see https://www.gnu.org/licenses/. */
/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)
+/* Change this to 1 to take the cpuid from GMP_CPU_TYPE env var. */
+#define WANT_FAKE_CPUID 0
+
/* fat_entry.asm */
-long __gmpn_cpuid (char [12], int);
-int __gmpn_cpuid_available (void);
+long __gmpn_cpuid __GMP_PROTO ((char dst[12], int id));
+int __gmpn_cpuid_available __GMP_PROTO ((void));
+
#if WANT_FAKE_CPUID
@@ -56,9 +49,8 @@ int __gmpn_cpuid_available (void);
#define __gmpn_cpuid fake_cpuid
#define __gmpn_cpuid_available fake_cpuid_available
-#define MAKE_FMS(family, model) \
- ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20) \
- + (((model) & 0xf) << 4) + (((model) & 0xf0) << 12))
+#define MAKE_FMS(family, model) \
+ (((family) << 8) + ((model << 4)))
static struct {
const char *name;
@@ -72,29 +64,17 @@ static struct {
{ "pentiumpro", "GenuineIntel", MAKE_FMS (6, 0) },
{ "pentium2", "GenuineIntel", MAKE_FMS (6, 2) },
{ "pentium3", "GenuineIntel", MAKE_FMS (6, 7) },
- { "pentium4", "GenuineIntel", MAKE_FMS (15, 2) },
- { "prescott", "GenuineIntel", MAKE_FMS (15, 3) },
- { "nocona", "GenuineIntel", MAKE_FMS (15, 4) },
- { "core2", "GenuineIntel", MAKE_FMS (6, 0xf) },
- { "coreinhm", "GenuineIntel", MAKE_FMS (6, 0x1a) },
- { "coreiwsm", "GenuineIntel", MAKE_FMS (6, 0x25) },
- { "coreisbr", "GenuineIntel", MAKE_FMS (6, 0x2a) },
- { "coreihwl", "GenuineIntel", MAKE_FMS (6, 0x3c) },
- { "atom", "GenuineIntel", MAKE_FMS (6, 0x1c) },
+ { "pentium4", "GenuineIntel", MAKE_FMS (7, 0) },
{ "k5", "AuthenticAMD", MAKE_FMS (5, 0) },
{ "k6", "AuthenticAMD", MAKE_FMS (5, 3) },
{ "k62", "AuthenticAMD", MAKE_FMS (5, 8) },
{ "k63", "AuthenticAMD", MAKE_FMS (5, 9) },
{ "athlon", "AuthenticAMD", MAKE_FMS (6, 0) },
- { "k8", "AuthenticAMD", MAKE_FMS (15, 0) },
- { "k10", "AuthenticAMD", MAKE_FMS (16, 0) },
- { "bobcat", "AuthenticAMD", MAKE_FMS (20, 1) },
- { "bulldozer", "AuthenticAMD", MAKE_FMS (21, 1) },
+ { "x86_64", "AuthenticAMD", MAKE_FMS (15, 0) },
{ "viac3", "CentaurHauls", MAKE_FMS (6, 0) },
{ "viac32", "CentaurHauls", MAKE_FMS (6, 9) },
- { "nano", "CentaurHauls", MAKE_FMS (6, 15) },
};
static int
@@ -148,46 +128,28 @@ typedef DECL_preinv_mod_1 ((*preinv_mod_1_t));
struct cpuvec_t __gmpn_cpuvec = {
__MPN(add_n_init),
- 0,
- 0,
__MPN(addmul_1_init),
- 0,
- __MPN(bdiv_dbm1c_init),
- __MPN(cnd_add_n_init),
- __MPN(cnd_sub_n_init),
- __MPN(com_init),
__MPN(copyd_init),
__MPN(copyi_init),
__MPN(divexact_1_init),
+ __MPN(divexact_by3c_init),
__MPN(divrem_1_init),
__MPN(gcd_1_init),
__MPN(lshift_init),
- __MPN(lshiftc_init),
__MPN(mod_1_init),
- __MPN(mod_1_1p_init),
- __MPN(mod_1_1p_cps_init),
- __MPN(mod_1s_2p_init),
- __MPN(mod_1s_2p_cps_init),
- __MPN(mod_1s_4p_init),
- __MPN(mod_1s_4p_cps_init),
__MPN(mod_34lsub1_init),
__MPN(modexact_1c_odd_init),
__MPN(mul_1_init),
__MPN(mul_basecase_init),
- __MPN(mullo_basecase_init),
__MPN(preinv_divrem_1_init),
__MPN(preinv_mod_1_init),
- __MPN(redc_1_init),
- __MPN(redc_2_init),
__MPN(rshift_init),
__MPN(sqr_basecase_init),
__MPN(sub_n_init),
- 0,
__MPN(submul_1_init),
0
};
-int __gmpn_cpuvec_initialized = 0;
/* The following setups start with generic x86, then overwrite with
specifics for a chip, and higher versions of that chip.
@@ -257,107 +219,21 @@ __gmpn_cpuvec_init (void)
case 6:
TRACE (printf (" p6\n"));
CPUVEC_SETUP_p6;
- switch (model)
- {
- case 0x00:
- case 0x01:
- TRACE (printf (" pentiumpro\n"));
- break;
-
- case 0x02:
- case 0x03:
- case 0x04:
- case 0x05:
- case 0x06:
- TRACE (printf (" pentium2\n"));
- CPUVEC_SETUP_p6_mmx;
- break;
-
- case 0x07:
- case 0x08:
- case 0x0a:
- case 0x0b:
- case 0x0c:
- TRACE (printf (" pentium3\n"));
- CPUVEC_SETUP_p6_mmx;
- CPUVEC_SETUP_p6_p3mmx;
- break;
-
- case 0x09: /* Banias */
- case 0x0d: /* Dothan */
- case 0x0e: /* Yonah */
- TRACE (printf (" Banias/Dothan/Yonah\n"));
- CPUVEC_SETUP_p6_mmx;
- CPUVEC_SETUP_p6_p3mmx;
- CPUVEC_SETUP_p6_sse2;
- break;
-
- case 0x0f: /* Conroe Merom Kentsfield Allendale */
- case 0x10:
- case 0x11:
- case 0x12:
- case 0x13:
- case 0x14:
- case 0x15:
- case 0x16:
- case 0x17: /* PNR Wolfdale Yorkfield */
- case 0x18:
- case 0x19:
- case 0x1d: /* PNR Dunnington */
- TRACE (printf (" Conroe\n"));
- CPUVEC_SETUP_p6_mmx;
- CPUVEC_SETUP_p6_p3mmx;
- CPUVEC_SETUP_p6_sse2;
- CPUVEC_SETUP_core2;
- break;
-
- case 0x1c: /* Atom Silverthorne */
- case 0x26: /* Atom Lincroft */
- case 0x27: /* Atom Saltwell */
- case 0x36: /* Atom Cedarview/Saltwell */
- TRACE (printf (" atom\n"));
- CPUVEC_SETUP_atom;
- CPUVEC_SETUP_atom_mmx;
- CPUVEC_SETUP_atom_sse2;
- break;
-
- case 0x1a: /* NHM Gainestown */
- case 0x1b:
- case 0x1e: /* NHM Lynnfield/Jasper */
- case 0x1f:
- case 0x20:
- case 0x21:
- case 0x22:
- case 0x23:
- case 0x24:
- case 0x25: /* WSM Clarkdale/Arrandale */
- case 0x28:
- case 0x29:
- case 0x2b:
- case 0x2c: /* WSM Gulftown */
- case 0x2e: /* NHM Beckton */
- case 0x2f: /* WSM Eagleton */
- TRACE (printf (" nehalem/westmere\n"));
- CPUVEC_SETUP_p6_mmx;
- CPUVEC_SETUP_p6_p3mmx;
- CPUVEC_SETUP_p6_sse2;
- CPUVEC_SETUP_core2;
- CPUVEC_SETUP_coreinhm;
- break;
-
- case 0x2a: /* SBR */
- case 0x2d: /* SBR-EP */
- case 0x3a: /* IBR */
- case 0x3c: /* Haswell */
- TRACE (printf (" sandybridge\n"));
+ if (model >= 2)
+ {
+ TRACE (printf (" pentium2\n"));
CPUVEC_SETUP_p6_mmx;
+ }
+ if (model >= 7)
+ {
+ TRACE (printf (" pentium3\n"));
CPUVEC_SETUP_p6_p3mmx;
+ }
+ if (model >= 0xD || model == 9)
+ {
+ TRACE (printf (" p6 with sse2\n"));
CPUVEC_SETUP_p6_sse2;
- CPUVEC_SETUP_core2;
- CPUVEC_SETUP_coreinhm;
- CPUVEC_SETUP_coreisbr;
- break;
- }
+ }
break;
case 15:
@@ -395,40 +271,13 @@ __gmpn_cpuvec_init (void)
break;
case 6:
TRACE (printf (" athlon\n"));
+ athlon:
CPUVEC_SETUP_k7;
CPUVEC_SETUP_k7_mmx;
break;
-
- case 0x0f: /* k8 */
- case 0x11: /* "fam 11h", mix of k8 and k10 */
- case 0x13: /* unknown, conservatively assume k8 */
- case 0x16: /* unknown, conservatively assume k8 */
- case 0x17: /* unknown, conservatively assume k8 */
- TRACE (printf (" k8\n"));
- CPUVEC_SETUP_k7;
- CPUVEC_SETUP_k7_mmx;
- CPUVEC_SETUP_k8;
- break;
-
- case 0x10: /* k10 */
- case 0x12: /* k10 (llano) */
- TRACE (printf (" k10\n"));
- CPUVEC_SETUP_k7;
- CPUVEC_SETUP_k7_mmx;
- break;
-
- case 0x14: /* bobcat */
- TRACE (printf (" bobcat\n"));
- CPUVEC_SETUP_k7;
- CPUVEC_SETUP_k7_mmx;
- CPUVEC_SETUP_bobcat;
- break;
-
- case 0x15: /* bulldozer */
- TRACE (printf (" bulldozer\n"));
- CPUVEC_SETUP_k7;
- CPUVEC_SETUP_k7_mmx;
- break;
+ case 15:
+ TRACE (printf (" x86_64\n"));
+ goto athlon;
}
}
else if (strcmp (vendor_string, "CentaurHauls") == 0)
@@ -441,11 +290,6 @@ __gmpn_cpuvec_init (void)
{
TRACE (printf (" viac32\n"));
}
- if (model >= 15)
- {
- TRACE (printf (" nano\n"));
- CPUVEC_SETUP_nano;
- }
break;
}
}
@@ -469,5 +313,5 @@ __gmpn_cpuvec_init (void)
/* Set this once the threshold fields are ready.
Use volatile to prevent it getting moved. */
- *((volatile int *) &__gmpn_cpuvec_initialized) = 1;
+ ((volatile struct cpuvec_t *) &__gmpn_cpuvec)->initialized = 1;
}
diff --git a/gmp/mpn/x86/fat/fat_entry.asm b/gmp/mpn/x86/fat/fat_entry.asm
index 6e3cb44dd5..bd46e4e8bd 100644
--- a/gmp/mpn/x86/fat/fat_entry.asm
+++ b/gmp/mpn/x86/fat/fat_entry.asm
@@ -1,32 +1,21 @@
dnl x86 fat binary entrypoints.
-dnl Copyright 2003, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -129,7 +118,7 @@ EPILOGUE()
L(fat_init):
C al __gmpn_cpuvec byte offset
- movzbl %al, %eax
+ movsbl %al, %eax
pushl %eax
ifdef(`PIC',`
diff --git a/gmp/mpn/x86/fat/gcd_1.c b/gmp/mpn/x86/fat/gcd_1.c
index f809bd8092..5bd000618c 100644
--- a/gmp/mpn/x86/fat/gcd_1.c
+++ b/gmp/mpn/x86/fat/gcd_1.c
@@ -5,28 +5,17 @@ Copyright 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "mpn/generic/gcd_1.c"
diff --git a/gmp/mpn/x86/fat/gmp-mparam.h b/gmp/mpn/x86/fat/gmp-mparam.h
index 3641a6bafa..9127d1425f 100644
--- a/gmp/mpn/x86/fat/gmp-mparam.h
+++ b/gmp/mpn/x86/fat/gmp-mparam.h
@@ -1,35 +1,25 @@
/* Fat binary x86 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2003, 2011 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003 Free Software Foundation,
+Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes. The only time
@@ -44,17 +34,15 @@ see https://www.gnu.org/licenses/. */
preinv. */
#define USE_PREINV_DIVREM_1 1
-#define BMOD_1_TO_MOD_1_THRESHOLD 20
-
/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need
- for mpn_sqr to call the latter. */
+ for mpn_sqr_n to call the latter. */
#define SQR_BASECASE_THRESHOLD 0
/* Sensible fallbacks for these, when not taken from a cpu-specific
gmp-mparam.h. */
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 130
-#define SQR_TOOM2_THRESHOLD 30
+#define MUL_KARATSUBA_THRESHOLD 20
+#define MUL_TOOM3_THRESHOLD 130
+#define SQR_KARATSUBA_THRESHOLD 30
#define SQR_TOOM3_THRESHOLD 200
/* These are values more or less in the middle of what the typical x86 chips
diff --git a/gmp/mpn/x86/fat/lshiftc.c b/gmp/mpn/x86/fat/lshiftc.c
deleted file mode 100644
index 9ecf48978f..0000000000
--- a/gmp/mpn/x86/fat/lshiftc.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_lshiftc.
-
-Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/lshiftc.c"
diff --git a/gmp/mpn/x86/fat/mod_1.c b/gmp/mpn/x86/fat/mod_1.c
deleted file mode 100644
index 4f149cc353..0000000000
--- a/gmp/mpn/x86/fat/mod_1.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_mod_1.
-
-Copyright 2003, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/mod_1.c"
diff --git a/gmp/mpn/x86/fat/mod_1_1.c b/gmp/mpn/x86/fat/mod_1_1.c
deleted file mode 100644
index 92eaa7a87f..0000000000
--- a/gmp/mpn/x86/fat/mod_1_1.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Fat binary fallback mpn_mod_1_1p.
-
-Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/*
-PROLOGUE(mpn_mod_1_1p_cps)
-*/
-
-#define OPERATION_mod_1_1_cps 1
-#include "mpn/generic/mod_1_1.c"
diff --git a/gmp/mpn/x86/fat/mod_1_2.c b/gmp/mpn/x86/fat/mod_1_2.c
deleted file mode 100644
index 9095a61c93..0000000000
--- a/gmp/mpn/x86/fat/mod_1_2.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Fat binary fallback mpn_mod_1s_2p.
-
-Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/*
-PROLOGUE(mpn_mod_1s_2p_cps)
-*/
-
-#define OPERATION_mod_1_2_cps 1
-#include "mpn/generic/mod_1_2.c"
diff --git a/gmp/mpn/x86/fat/mod_1_4.c b/gmp/mpn/x86/fat/mod_1_4.c
deleted file mode 100644
index 51c0def443..0000000000
--- a/gmp/mpn/x86/fat/mod_1_4.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Fat binary fallback mpn_mod_1s_4p.
-
-Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-/*
-PROLOGUE(mpn_mod_1s_4p_cps)
-*/
-
-#define OPERATION_mod_1_4_cps 1
-#include "mpn/generic/mod_1_4.c"
diff --git a/gmp/mpn/x86/fat/mode1o.c b/gmp/mpn/x86/fat/mode1o.c
index 870ddb899b..a5244cae44 100644
--- a/gmp/mpn/x86/fat/mode1o.c
+++ b/gmp/mpn/x86/fat/mode1o.c
@@ -5,28 +5,17 @@ Copyright 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "mpn/generic/mode1o.c"
diff --git a/gmp/mpn/x86/fat/mullo_basecase.c b/gmp/mpn/x86/fat/mullo_basecase.c
deleted file mode 100644
index 7f86be64c5..0000000000
--- a/gmp/mpn/x86/fat/mullo_basecase.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_mullo_basecase.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/mullo_basecase.c"
diff --git a/gmp/mpn/x86/fat/redc_1.c b/gmp/mpn/x86/fat/redc_1.c
deleted file mode 100644
index 0025403353..0000000000
--- a/gmp/mpn/x86/fat/redc_1.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_redc_1.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/redc_1.c"
diff --git a/gmp/mpn/x86/fat/redc_2.c b/gmp/mpn/x86/fat/redc_2.c
deleted file mode 100644
index 1932d58323..0000000000
--- a/gmp/mpn/x86/fat/redc_2.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_redc_2.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/redc_2.c"
diff --git a/gmp/mpn/x86/geode/gmp-mparam.h b/gmp/mpn/x86/geode/gmp-mparam.h
deleted file mode 100644
index cc9c9f1789..0000000000
--- a/gmp/mpn/x86/geode/gmp-mparam.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2002, 2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* Generated by tuneup.c, 2011-01-30, gcc 3.4 */
-
-#define MOD_1_NORM_THRESHOLD 6
-#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 17
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define USE_PREINV_DIVREM_1 0
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 42
-
-#define MUL_TOOM22_THRESHOLD 18
-#define MUL_TOOM33_THRESHOLD 66
-#define MUL_TOOM44_THRESHOLD 105
-#define MUL_TOOM6H_THRESHOLD 141
-#define MUL_TOOM8H_THRESHOLD 212
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 67
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 33
-#define SQR_TOOM3_THRESHOLD 60
-#define SQR_TOOM4_THRESHOLD 136
-#define SQR_TOOM6_THRESHOLD 196
-#define SQR_TOOM8_THRESHOLD 292
-
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 16
-
-#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 468, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
- { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
- { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 61
-#define MUL_FFT_THRESHOLD 5504
-
-#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 396, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
- { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \
- { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
- { 287, 8}, { 575,10}, { 159,11}, { 95,10}, \
- { 191,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 61
-#define SQR_FFT_THRESHOLD 3712
-
-#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 37
-#define MULLO_MUL_N_THRESHOLD 10950
-
-#define DC_DIV_QR_THRESHOLD 59
-#define DC_DIVAPPR_Q_THRESHOLD 189
-#define DC_BDIV_QR_THRESHOLD 55
-#define DC_BDIV_Q_THRESHOLD 136
-
-#define INV_MULMOD_BNM1_THRESHOLD 50
-#define INV_NEWTON_THRESHOLD 183
-#define INV_APPR_THRESHOLD 181
-
-#define BINV_NEWTON_THRESHOLD 204
-#define REDC_1_TO_REDC_N_THRESHOLD 54
-
-#define MU_DIV_QR_THRESHOLD 1142
-#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 81
-#define MU_BDIV_QR_THRESHOLD 889
-#define MU_BDIV_Q_THRESHOLD 998
-
-#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 133
-#define GCD_DC_THRESHOLD 451
-#define GCDEXT_DC_THRESHOLD 318
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 15
-#define GET_STR_PRECOMPUTE_THRESHOLD 30
-#define SET_STR_DC_THRESHOLD 547
-#define SET_STR_PRECOMPUTE_THRESHOLD 1049
diff --git a/gmp/mpn/x86/gmp-mparam.h b/gmp/mpn/x86/gmp-mparam.h
index 2cb1984889..22ee86f7e1 100644
--- a/gmp/mpn/x86/gmp-mparam.h
+++ b/gmp/mpn/x86/gmp-mparam.h
@@ -1,35 +1,24 @@
/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2002 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* Generic x86 mpn_divexact_1 is faster than generic x86 mpn_divrem_1 on all
diff --git a/gmp/mpn/x86/i486/gmp-mparam.h b/gmp/mpn/x86/i486/gmp-mparam.h
index aa7dbad45b..aaddea9f18 100644
--- a/gmp/mpn/x86/i486/gmp-mparam.h
+++ b/gmp/mpn/x86/i486/gmp-mparam.h
@@ -1,46 +1,35 @@
/* 80486 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2001-2003 Free Software Foundation, Inc.
+Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* 100MHz DX4 */
/* Generated by tuneup.c, 2003-02-13, gcc 2.95 */
-#define MUL_TOOM22_THRESHOLD 18
-#define MUL_TOOM33_THRESHOLD 228
+#define MUL_KARATSUBA_THRESHOLD 18
+#define MUL_TOOM3_THRESHOLD 228
#define SQR_BASECASE_THRESHOLD 13
-#define SQR_TOOM2_THRESHOLD 49
+#define SQR_KARATSUBA_THRESHOLD 49
#define SQR_TOOM3_THRESHOLD 238
#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
diff --git a/gmp/mpn/x86/k10/gmp-mparam.h b/gmp/mpn/x86/k10/gmp-mparam.h
deleted file mode 100644
index 2a1ae5a6bb..0000000000
--- a/gmp/mpn/x86/k10/gmp-mparam.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2400 MHz K10 Barcelona */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 12
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 12
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 32
-
-#define MUL_TOOM22_THRESHOLD 24
-#define MUL_TOOM33_THRESHOLD 81
-#define MUL_TOOM44_THRESHOLD 130
-#define MUL_TOOM6H_THRESHOLD 189
-#define MUL_TOOM8H_THRESHOLD 430
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 82
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 90
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 112
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 38
-#define SQR_TOOM3_THRESHOLD 77
-#define SQR_TOOM4_THRESHOLD 184
-#define SQR_TOOM6_THRESHOLD 262
-#define SQR_TOOM8_THRESHOLD 369
-
-#define MULMID_TOOM42_THRESHOLD 56
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 18
-
-#define MUL_FFT_MODF_THRESHOLD 765 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 765, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
- { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
- { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95, 9}, { 199,10}, \
- { 111,11}, { 63,10}, { 127, 9}, { 263,10}, \
- { 175,11}, { 95,10}, { 207,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 543, 8}, { 1087, 9}, \
- { 575,11}, { 159,10}, { 319, 9}, { 671, 8}, \
- { 1343, 9}, { 735,11}, { 191, 9}, { 799, 8}, \
- { 1599,10}, { 415, 9}, { 863,11}, { 223,12}, \
- { 127,11}, { 255,10}, { 543, 9}, { 1087,10}, \
- { 607, 9}, { 1215, 8}, { 2431,11}, { 319,10}, \
- { 671, 9}, { 1343,10}, { 735,12}, { 191,11}, \
- { 383,10}, { 799, 9}, { 1599,11}, { 415,10}, \
- { 863, 9}, { 1727,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215, 9}, \
- { 2431,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 735,10}, { 1471, 9}, { 2943, 8}, { 5887,12}, \
- { 383,11}, { 799,10}, { 1599,11}, { 863,10}, \
- { 1727,12}, { 447,11}, { 959,10}, { 1919,11}, \
- { 991,10}, { 1983,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \
- { 2943, 9}, { 5887,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \
- { 959,11}, { 1983,14}, { 255,13}, { 511,12}, \
- { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \
- { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \
- { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
- { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \
- { 1151,12}, { 2495,13}, { 1407,12}, { 2943,11}, \
- { 5887,14}, { 767,13}, { 1663,12}, { 3455,13}, \
- { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
- { 2175,12}, { 4351,13}, { 2431,14}, { 1279,13}, \
- { 2943,12}, { 5887,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 172
-#define MUL_FFT_THRESHOLD 6784
-
-#define SQR_FFT_MODF_THRESHOLD 555 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 555, 5}, { 21, 6}, { 11, 5}, { 25, 6}, \
- { 13, 5}, { 27, 6}, { 27, 7}, { 15, 6}, \
- { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
- { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
- { 79, 9}, { 167,10}, { 95, 9}, { 191,10}, \
- { 111,11}, { 63,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
- { 1087,10}, { 287, 9}, { 607,11}, { 159,10}, \
- { 319, 9}, { 671, 8}, { 1343,10}, { 351, 9}, \
- { 735, 8}, { 1471,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 399, 9}, { 799, 8}, { 1599,10}, \
- { 415, 9}, { 863,11}, { 223,10}, { 479,12}, \
- { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \
- { 287,10}, { 607, 9}, { 1215, 8}, { 2431,11}, \
- { 319,10}, { 671, 9}, { 1343,11}, { 351,10}, \
- { 735, 9}, { 1471,12}, { 191,11}, { 383,10}, \
- { 799, 9}, { 1599,11}, { 415,10}, { 863, 9}, \
- { 1727,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \
- { 607,10}, { 1215, 9}, { 2431,12}, { 319,11}, \
- { 671,10}, { 1343,11}, { 735,10}, { 1471, 9}, \
- { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 959,10}, \
- { 1919,11}, { 991,10}, { 1983,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \
- { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \
- { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \
- { 831,11}, { 1727,10}, { 3455,12}, { 959,11}, \
- { 1983,13}, { 511,12}, { 1215,11}, { 2431,13}, \
- { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \
- { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
- { 2431,13}, { 1407,12}, { 2943,14}, { 767,13}, \
- { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \
- { 511,14}, { 1023,13}, { 2431,14}, { 1279,13}, \
- { 2943,12}, { 5887,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 172
-#define SQR_FFT_THRESHOLD 5504
-
-#define MULLO_BASECASE_THRESHOLD 7
-#define MULLO_DC_THRESHOLD 40
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 59
-#define DC_DIVAPPR_Q_THRESHOLD 270
-#define DC_BDIV_QR_THRESHOLD 55
-#define DC_BDIV_Q_THRESHOLD 206
-
-#define INV_MULMOD_BNM1_THRESHOLD 62
-#define INV_NEWTON_THRESHOLD 254
-#define INV_APPR_THRESHOLD 252
-
-#define BINV_NEWTON_THRESHOLD 274
-#define REDC_1_TO_REDC_N_THRESHOLD 74
-
-#define MU_DIV_QR_THRESHOLD 1589
-#define MU_DIVAPPR_Q_THRESHOLD 1589
-#define MUPI_DIV_QR_THRESHOLD 106
-#define MU_BDIV_QR_THRESHOLD 1470
-#define MU_BDIV_Q_THRESHOLD 1558
-
-#define POWM_SEC_TABLE 1,16,114,428,1240
-
-#define MATRIX22_STRASSEN_THRESHOLD 19
-#define HGCD_THRESHOLD 136
-#define HGCD_APPR_THRESHOLD 175
-#define HGCD_REDUCE_THRESHOLD 3389
-#define GCD_DC_THRESHOLD 595
-#define GCDEXT_DC_THRESHOLD 424
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 15
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 100
-#define SET_STR_PRECOMPUTE_THRESHOLD 1360
-
-#define FAC_DSC_THRESHOLD 224
-#define FAC_ODD_THRESHOLD 29
diff --git a/gmp/mpn/x86/k6/README b/gmp/mpn/x86/k6/README
index 1d65af3851..f488cbd1d8 100644
--- a/gmp/mpn/x86/k6/README
+++ b/gmp/mpn/x86/k6/README
@@ -3,28 +3,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86/k6/aors_n.asm b/gmp/mpn/x86/k6/aors_n.asm
index 168f9b4ae4..09afd8f688 100644
--- a/gmp/mpn/x86/k6/aors_n.asm
+++ b/gmp/mpn/x86/k6/aors_n.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/aorsmul_1.asm b/gmp/mpn/x86/k6/aorsmul_1.asm
index eaa92ebb24..c3795e3abb 100644
--- a/gmp/mpn/x86/k6/aorsmul_1.asm
+++ b/gmp/mpn/x86/k6/aorsmul_1.asm
@@ -1,52 +1,42 @@
dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl Copyright 1999-2003, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2005 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5
-C P6 model 0-8,10-12 5.94
-C P6 model 9 (Banias) 5.51
-C P6 model 13 (Dothan) 5.57
+C cycles/limb
+C P5:
+C P6 model 0-8,10-12 5.94
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 5.57
C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood)
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6 7.65-8.5 (data dependent)
-C AMD K7
-C AMD K8
+C K6: 7.65-8.5 (data dependent)
+C K7:
+C K8:
-dnl K6: large multipliers small multipliers
+dnl K6: large multipliers small multipliers
dnl UNROLL_COUNT cycles/limb cycles/limb
dnl 4 9.5 7.78
dnl 8 9.0 7.78
@@ -257,7 +247,7 @@ C registers at the point of doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %esi is necessary only for the
C mpn_addmul/submul_1c entry points. Duplicating the startup code to
-C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
C idea.
dnl overlapping with parameters already fetched
diff --git a/gmp/mpn/x86/k6/cross.pl b/gmp/mpn/x86/k6/cross.pl
index fc921a56b7..cf476d603b 100755
--- a/gmp/mpn/x86/k6/cross.pl
+++ b/gmp/mpn/x86/k6/cross.pl
@@ -2,31 +2,20 @@
# Copyright 2000, 2001 Free Software Foundation, Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# Usage: cross.pl [filename.o]...
diff --git a/gmp/mpn/x86/k6/divrem_1.asm b/gmp/mpn/x86/k6/divrem_1.asm
index b4cea4fa2a..1c86d9bd6c 100644
--- a/gmp/mpn/x86/k6/divrem_1.asm
+++ b/gmp/mpn/x86/k6/divrem_1.asm
@@ -1,32 +1,22 @@
dnl AMD K6 mpn_divrem_1 -- mpn by limb division.
-dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/gcd_1.asm b/gmp/mpn/x86/k6/gcd_1.asm
index 0c233ff362..58aff08221 100644
--- a/gmp/mpn/x86/k6/gcd_1.asm
+++ b/gmp/mpn/x86/k6/gcd_1.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
-dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/gmp-mparam.h b/gmp/mpn/x86/k6/gmp-mparam.h
index f03f1b2d91..c04446a573 100644
--- a/gmp/mpn/x86/k6/gmp-mparam.h
+++ b/gmp/mpn/x86/k6/gmp-mparam.h
@@ -1,166 +1,68 @@
/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2004, 2009, 2010 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-or both in parallel, as here.
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+/* 450MHz K6-2 */
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+/* Generated by tuneup.c, 2009-01-05, gcc 3.4 */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define MUL_KARATSUBA_THRESHOLD 19
+#define MUL_TOOM3_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 104
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 105
+#define SQR_TOOM4_THRESHOLD 143
-/* 450MHz K6-2 */
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 64
+#define MULLOW_MUL_N_THRESHOLD 232
+
+#define DIV_SB_PREINV_THRESHOLD 4
+#define DIV_DC_THRESHOLD 67
+#define POWM_THRESHOLD 110
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD_THRESHOLD 195
+#define GCD_DC_THRESHOLD 602
+#define GCDEXT_DC_THRESHOLD 662
+#define JACOBI_BASE_METHOD 2
+
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 31
+#define GET_STR_PRECOMPUTE_THRESHOLD 52
+#define SET_STR_DC_THRESHOLD 1127
+#define SET_STR_PRECOMPUTE_THRESHOLD 1795
+
+#define MUL_FFT_TABLE { 336, 672, 1152, 3584, 10240, 24576, 163840, 393216, 0 }
+#define MUL_FFT_MODF_THRESHOLD 352
+#define MUL_FFT_THRESHOLD 7168
-#define MOD_1_NORM_THRESHOLD 12
-#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 41
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 32
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 3
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 128
-#define USE_PREINV_DIVREM_1 0
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 69
-#define MUL_TOOM44_THRESHOLD 106
-#define MUL_TOOM6H_THRESHOLD 157
-#define MUL_TOOM8H_THRESHOLD 199
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 32
-#define SQR_TOOM3_THRESHOLD 97
-#define SQR_TOOM4_THRESHOLD 143
-#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 272
-
-#define MULMOD_BNM1_THRESHOLD 13
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 476, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 11, 5}, { 23, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 11, 6}, { 23, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \
- { 35, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 167,10}, { 95, 9}, { 191,10}, \
- { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
- { 287,11}, { 159,10}, { 351,11}, { 191,10}, \
- { 415, 9}, { 831,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 543,11}, { 287,10}, { 575,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 415,10}, \
- { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
- { 1087,11}, { 575,12}, { 319,11}, { 703,12}, \
- { 383,11}, { 831,12}, { 447,11}, { 895,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1151,12}, { 703,13}, { 383,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1215,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 106
-#define MUL_FFT_THRESHOLD 7424
-
-#define SQR_FFT_MODF_THRESHOLD 432 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 432, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 24, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 29, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 7}, { 93, 8}, { 47, 7}, \
- { 95, 8}, { 51,10}, { 15, 9}, { 31, 8}, \
- { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 71, 8}, \
- { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 167,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159, 9}, { 319,11}, { 95,10}, \
- { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 351, 9}, \
- { 703,11}, { 191,10}, { 415,11}, { 223,12}, \
- { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
- { 607,11}, { 319,10}, { 639,11}, { 351,10}, \
- { 703,12}, { 191,11}, { 415,10}, { 831,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,12}, { 319,11}, { 703,12}, { 383,11}, \
- { 831,12}, { 447,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,12}, { 703,13}, \
- { 383,12}, { 895,14}, { 255,13}, { 511,12}, \
- { 1215,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 112
-#define SQR_FFT_THRESHOLD 7040
-
-#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 60
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 78
-#define DC_DIVAPPR_Q_THRESHOLD 252
-#define DC_BDIV_QR_THRESHOLD 84
-#define DC_BDIV_Q_THRESHOLD 171
-
-#define INV_MULMOD_BNM1_THRESHOLD 55
-#define INV_NEWTON_THRESHOLD 234
-#define INV_APPR_THRESHOLD 236
-
-#define BINV_NEWTON_THRESHOLD 268
-#define REDC_1_TO_REDC_N_THRESHOLD 67
-
-#define MU_DIV_QR_THRESHOLD 1308
-#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 134
-#define MU_BDIV_QR_THRESHOLD 1164
-#define MU_BDIV_Q_THRESHOLD 1164
-
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 182
-#define GCD_DC_THRESHOLD 591
-#define GCDEXT_DC_THRESHOLD 472
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 24
-#define GET_STR_PRECOMPUTE_THRESHOLD 40
-#define SET_STR_DC_THRESHOLD 834
-#define SET_STR_PRECOMPUTE_THRESHOLD 2042
+#define SQR_FFT_TABLE { 272, 672, 1408, 4608, 10240, 24576, 163840, 393216, 0 }
+#define SQR_FFT_MODF_THRESHOLD 336
+#define SQR_FFT_THRESHOLD 3840
diff --git a/gmp/mpn/x86/k6/k62mmx/copyd.asm b/gmp/mpn/x86/k6/k62mmx/copyd.asm
index f80a5a1cdb..227ed78783 100644
--- a/gmp/mpn/x86/k6/k62mmx/copyd.asm
+++ b/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -1,32 +1,21 @@
dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/k62mmx/lshift.asm b/gmp/mpn/x86/k6/k62mmx/lshift.asm
index c86575feed..e48e73e19a 100644
--- a/gmp/mpn/x86/k6/k62mmx/lshift.asm
+++ b/gmp/mpn/x86/k6/k62mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl AMD K6-2 mpn_lshift -- mpn left shift.
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/k62mmx/rshift.asm b/gmp/mpn/x86/k6/k62mmx/rshift.asm
index f604a7bd52..b3114d0e6e 100644
--- a/gmp/mpn/x86/k6/k62mmx/rshift.asm
+++ b/gmp/mpn/x86/k6/k62mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl AMD K6-2 mpn_rshift -- mpn right shift.
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/mmx/com.asm b/gmp/mpn/x86/k6/mmx/com_n.asm
index b747454627..42e6ab392a 100644
--- a/gmp/mpn/x86/k6/mmx/com.asm
+++ b/gmp/mpn/x86/k6/mmx/com_n.asm
@@ -1,32 +1,21 @@
-dnl AMD K6-2 mpn_com -- mpn bitwise one's complement.
+dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -39,7 +28,7 @@ C K6-2 1.0 1.18 1.18 1.18 cycles/limb
C K6 1.5 1.85 1.75 1.85
-C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Take the bitwise ones-complement of src,size and write it to dst,size.
@@ -49,7 +38,7 @@ defframe(PARAM_DST, 4)
TEXT
ALIGN(16)
-PROLOGUE(mpn_com)
+PROLOGUE(mpn_com_n)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
diff --git a/gmp/mpn/x86/k6/mmx/dive_1.asm b/gmp/mpn/x86/k6/mmx/dive_1.asm
index b644dca8cd..9cc90d88a5 100644
--- a/gmp/mpn/x86/k6/mmx/dive_1.asm
+++ b/gmp/mpn/x86/k6/mmx/dive_1.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
-dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -128,7 +117,7 @@ Zdisp( movzbl, 0,(%eax,%ebp), %eax)
subl %ebp, %eax C inv = 2*inv - inv*inv*d
subl $1, %edx C shift amount, and clear carry
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
imull PARAM_DIVISOR, %eax
cmpl $1, %eax
diff --git a/gmp/mpn/x86/k6/mmx/logops_n.asm b/gmp/mpn/x86/k6/mmx/logops_n.asm
index e17930bb2d..a6272131a2 100644
--- a/gmp/mpn/x86/k6/mmx/logops_n.asm
+++ b/gmp/mpn/x86/k6/mmx/logops_n.asm
@@ -1,33 +1,22 @@
dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/mmx/lshift.asm b/gmp/mpn/x86/k6/mmx/lshift.asm
index 45be582633..1492025171 100644
--- a/gmp/mpn/x86/k6/mmx/lshift.asm
+++ b/gmp/mpn/x86/k6/mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_lshift -- mpn left shift.
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/mmx/popham.asm b/gmp/mpn/x86/k6/mmx/popham.asm
index 2b19d0b5ee..a0a651d39c 100644
--- a/gmp/mpn/x86/k6/mmx/popham.asm
+++ b/gmp/mpn/x86/k6/mmx/popham.asm
@@ -1,33 +1,22 @@
dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
dnl hamming distance.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/mmx/rshift.asm b/gmp/mpn/x86/k6/mmx/rshift.asm
index cd0382f322..80cd6fb05a 100644
--- a/gmp/mpn/x86/k6/mmx/rshift.asm
+++ b/gmp/mpn/x86/k6/mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_rshift -- mpn right shift.
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/mod_34lsub1.asm b/gmp/mpn/x86/k6/mod_34lsub1.asm
index 7e30503e54..a5b7ee1064 100644
--- a/gmp/mpn/x86/k6/mod_34lsub1.asm
+++ b/gmp/mpn/x86/k6/mod_34lsub1.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/mode1o.asm b/gmp/mpn/x86/k6/mode1o.asm
index a13f647b81..f299877911 100644
--- a/gmp/mpn/x86/k6/mode1o.asm
+++ b/gmp/mpn/x86/k6/mode1o.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder.
-dnl Copyright 2000-2003, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2003, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -114,7 +103,7 @@ Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits
subl %ecx, %edi C inv = 2*inv - inv*inv*d
- ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax
movl %esi, %eax
imull %edi, %eax
diff --git a/gmp/mpn/x86/k6/mul_1.asm b/gmp/mpn/x86/k6/mul_1.asm
index 3ef7ec24fe..e1c468fe34 100644
--- a/gmp/mpn/x86/k6/mul_1.asm
+++ b/gmp/mpn/x86/k6/mul_1.asm
@@ -1,49 +1,38 @@
dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
dnl Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5
-C P6 model 0-8,10-12 5.5
+C cycles/limb
+C P5:
+C P6 model 0-8,10-12) 5.5
C P6 model 9 (Banias)
-C P6 model 13 (Dothan) 4.87
+C P6 model 13 (Dothan) 4.87
C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood)
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6 6.25
-C AMD K7
-C AMD K8
+C K6: 6.25
+C K7:
+C K8:
C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
diff --git a/gmp/mpn/x86/k6/mul_basecase.asm b/gmp/mpn/x86/k6/mul_basecase.asm
index 7030001c3f..dcd4d70082 100644
--- a/gmp/mpn/x86/k6/mul_basecase.asm
+++ b/gmp/mpn/x86/k6/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
-dnl Copyright 1999-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/pre_mod_1.asm b/gmp/mpn/x86/k6/pre_mod_1.asm
index 34db20d386..3231539bfd 100644
--- a/gmp/mpn/x86/k6/pre_mod_1.asm
+++ b/gmp/mpn/x86/k6/pre_mod_1.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k6/sqr_basecase.asm b/gmp/mpn/x86/k6/sqr_basecase.asm
index b7ecb5cc8a..3392d38812 100644
--- a/gmp/mpn/x86/k6/sqr_basecase.asm
+++ b/gmp/mpn/x86/k6/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -36,35 +25,35 @@ C product (measured on the speed difference between 17 and 33 limbs,
C which is roughly the Karatsuba recursing range).
-dnl SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
+dnl SQR_KARATSUBA_THRESHOLD_MAX is the maximum SQR_KARATSUBA_THRESHOLD this
dnl code supports. This value is used only by the tune program to know
dnl what it can go up to. (An attempt to compile with a bigger value will
dnl trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl The value is determined by requiring the displacements in the unrolled
dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
-dnl 63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
+dnl 63, giving a maximum SQR_KARATSUBA_THRESHOLD of 66.
-deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66)
dnl Allow a value from the tune program to override config.m4.
-ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
-`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
+`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
-dnl number required is determined by SQR_TOOM2_THRESHOLD, since
-dnl mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
+dnl number required is determined by SQR_KARATSUBA_THRESHOLD, since
+dnl mpn_sqr_basecase only needs to handle sizes < SQR_KARATSUBA_THRESHOLD.
dnl
dnl The first addmul is the biggest, and this takes the second least
dnl significant limb and multiplies it by the third least significant and
-dnl up. Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
-dnl limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
+dnl up. Hence for a maximum operand size of SQR_KARATSUBA_THRESHOLD-1
+dnl limbs, UNROLL_COUNT needs to be SQR_KARATSUBA_THRESHOLD-3.
-m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
-deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
diff --git a/gmp/mpn/x86/k7/README b/gmp/mpn/x86/k7/README
index 5711b612c5..e2c5e0c18d 100644
--- a/gmp/mpn/x86/k7/README
+++ b/gmp/mpn/x86/k7/README
@@ -3,28 +3,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86/k7/addlsh1_n.asm b/gmp/mpn/x86/k7/addlsh1_n.asm
deleted file mode 100644
index a957b6f78e..0000000000
--- a/gmp/mpn/x86/k7/addlsh1_n.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
-C The innerloop is 2*3-way unrolled, which is best we can do with the available
-C registers. It seems tricky to use the same structure for rsblsh1_n, since we
-C cannot feed carry between operations there.
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 6
-C AMD K6 ?
-C AMD K7 2.5
-C AMD K8
-
-C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
-C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
-C that means we need an initial magic multiply.
-C
-C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We
-C cannot do rsblsh1_n since we feed carry from the shift blocks to the
-C add/subtract blocks, which is right for addition but reversed for
-C subtraction. We could perhaps do sublsh1_n, with some extra move insns,
-C without losing any time, since we're not issue limited but carry recurrency
-C latency.
-C
-C Breaking carry recurrency might be a good idea. We would then need separate
-C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
-
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_DBLD, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(VAR_COUNT,`PARAM_DST')
-define(VAR_TMP,`PARAM_DBLD')
-
-ASM_START()
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_addlsh1_n)
-deflit(`FRAME',0)
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`vp', `%ebp')
-
- mov $0x2aaaaaab, %eax
-
- push %ebx FRAME_pushl()
- mov PARAM_SIZE, %ebx C size
-
- push rp FRAME_pushl()
- mov PARAM_DST, rp
-
- mul %ebx
-
- push up FRAME_pushl()
- mov PARAM_SRC, up
-
- not %edx C count = -(size\8)-1
- mov %edx, VAR_COUNT
-
- push vp FRAME_pushl()
- mov PARAM_DBLD, vp
-
- lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3
- xor %edx, %edx
- lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6
- or %ebx, %ebx
- jz L(exact)
-
-L(oop):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
- adc %eax, %eax
- rcr %edx C restore 1st saved carry bit
- lea 4(vp), vp
- adc (up), %eax
- lea 4(up), up
- adc %edx, %edx C save a carry bit in edx
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- dec %ebx
- mov %eax, (rp)
- lea 4(rp), rp
- jnz L(oop)
- mov vp, VAR_TMP
-L(exact):
- incl VAR_COUNT
- jz L(end)
-
- ALIGN(16)
-L(top):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (vp), %eax
- adc %eax, %eax
- mov 4(vp), %ebx
- adc %ebx, %ebx
- mov 8(vp), %ecx
- adc %ecx, %ecx
-
- rcr %edx C restore 1st saved carry bit
-
- adc (up), %eax
- mov %eax, (rp)
- adc 4(up), %ebx
- mov %ebx, 4(rp)
- adc 8(up), %ecx
- mov %ecx, 8(rp)
-
- mov 12(vp), %eax
- adc %eax, %eax
- mov 16(vp), %ebx
- adc %ebx, %ebx
- mov 20(vp), %ecx
- adc %ecx, %ecx
-
- lea 24(vp), vp
- adc %edx, %edx C save a carry bit in edx
-
- adc 12(up), %eax
- mov %eax, 12(rp)
- adc 16(up), %ebx
- mov %ebx, 16(rp)
- adc 20(up), %ecx
-
- lea 24(up), up
-
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- mov %ecx, 20(rp)
- incl VAR_COUNT
- lea 24(rp), rp
- jne L(top)
-
-L(end):
- pop vp FRAME_popl()
- pop up FRAME_popl()
-
-ifdef(`CPU_P6',`
- xor %eax, %eax
- shr $1, %edx
- adc %edx, %eax
-',`
- adc $0, %edx
- mov %edx, %eax
-')
- pop rp FRAME_popl()
- pop %ebx FRAME_popl()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/k7/aors_n.asm b/gmp/mpn/x86/k7/aors_n.asm
index 1a08072029..d84de3ee98 100644
--- a/gmp/mpn/x86/k7/aors_n.asm
+++ b/gmp/mpn/x86/k7/aors_n.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
-dnl Copyright 1999-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/aorsmul_1.asm b/gmp/mpn/x86/k7/aorsmul_1.asm
index eec8df6de2..b247c29131 100644
--- a/gmp/mpn/x86/k7/aorsmul_1.asm
+++ b/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -1,49 +1,39 @@
dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias) 6.5
+C cycles/limb
+C P5:
+C P6 model 0-8,10-12)
+C P6 model 9 (Banias)
C P6 model 13 (Dothan)
C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood)
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6
-C AMD K7 3.75
-C AMD K8
+C K6:
+C K7: 3.75
+C K8:
C TODO
C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
diff --git a/gmp/mpn/x86/k7/bdiv_q_1.asm b/gmp/mpn/x86/k7/bdiv_q_1.asm
deleted file mode 100644
index df3477f539..0000000000
--- a/gmp/mpn/x86/k7/bdiv_q_1.asm
+++ /dev/null
@@ -1,244 +0,0 @@
-dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
-
-dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
-
-dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C Athlon: 11.0
-C Hammer: 9.0
-
-
-C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-C The dependent chain is mul+imul+sub for 11 cycles and that speed is
-C achieved with no special effort. The load and shrld latencies are hidden
-C by out of order execution.
-C
-C It's a touch faster on size==1 to use the mul-by-inverse than divl.
-
-defframe(PARAM_SHIFT, 24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-defframe(VAR_INVERSE, -20)
-defframe(VAR_DST_END, -24)
-
-deflit(STACK_SPACE, 24)
-
- TEXT
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t inverse, int shift)
- ALIGN(16)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
- movl PARAM_SHIFT, %ecx C shift count
-
- movl %ebp, SAVE_EBP
- movl PARAM_SIZE, %ebp
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl %ebx, SAVE_EBX
-
- leal (%esi,%ebp,4), %esi C src end
- leal (%edi,%ebp,4), %edi C dst end
- negl %ebp C -size
-
- movl PARAM_INVERSE, %eax C inv
-
-L(common):
- movl %eax, VAR_INVERSE
- movl (%esi,%ebp,4), %eax C src[0]
-
- incl %ebp
- jz L(one)
-
- movl (%esi,%ebp,4), %edx C src[1]
-
- shrdl( %cl, %edx, %eax)
-
- movl %edi, VAR_DST_END
- xorl %ebx, %ebx
- jmp L(entry)
-
- ALIGN(8)
-L(top):
- C eax q
- C ebx carry bit, 0 or 1
- C ecx shift
- C edx
- C esi src end
- C edi dst end
- C ebp counter, limbs, negative
-
- mull PARAM_DIVISOR C carry limb in edx
-
- movl -4(%esi,%ebp,4), %eax
- movl (%esi,%ebp,4), %edi
-
- shrdl( %cl, %edi, %eax)
-
- subl %ebx, %eax C apply carry bit
- setc %bl
- movl VAR_DST_END, %edi
-
- subl %edx, %eax C apply carry limb
- adcl $0, %ebx
-
-L(entry):
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi,%ebp,4)
- incl %ebp
- jnz L(top)
-
-
- mull PARAM_DIVISOR C carry limb in edx
-
- movl -4(%esi), %eax C src high limb
- shrl %cl, %eax
- movl SAVE_ESI, %esi
-
- subl %ebx, %eax C apply carry bit
- movl SAVE_EBX, %ebx
- movl SAVE_EBP, %ebp
-
- subl %edx, %eax C apply carry limb
-
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi)
- movl SAVE_EDI, %edi
- addl $STACK_SPACE, %esp
-
- ret
-
-L(one):
- shrl %cl, %eax
- movl SAVE_ESI, %esi
- movl SAVE_EBX, %ebx
-
- imull VAR_INVERSE, %eax
-
- movl SAVE_EBP, %ebp
-
- movl %eax, -4(%edi)
- movl SAVE_EDI, %edi
- addl $STACK_SPACE, %esp
-
- ret
-EPILOGUE()
-
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-
- ALIGN(16)
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_DIVISOR, %eax
- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
- movl $-1, %ecx C shift count
-
- movl %ebp, SAVE_EBP
- movl PARAM_SIZE, %ebp
-
- movl %esi, SAVE_ESI
- movl %edi, SAVE_EDI
-
- C If there's usually only one or two trailing zero bits then this
- C should be faster than bsfl.
-L(strip_twos):
- incl %ecx
- shrl %eax
- jnc L(strip_twos)
-
- movl %ebx, SAVE_EBX
- leal 1(%eax,%eax), %ebx C d without twos
- andl $127, %eax C d/2, 7 bits
-
-ifdef(`PIC',`
- LEA( binvert_limb_table, %edx)
- movzbl (%eax,%edx), %eax C inv 8 bits
-',`
- movzbl binvert_limb_table(%eax), %eax C inv 8 bits
-')
-
- leal (%eax,%eax), %edx C 2*inv
- movl %ebx, PARAM_DIVISOR C d without twos
-
- imull %eax, %eax C inv*inv
-
- movl PARAM_SRC, %esi
- movl PARAM_DST, %edi
-
- imull %ebx, %eax C inv*inv*d
-
- subl %eax, %edx C inv = 2*inv - inv*inv*d
- leal (%edx,%edx), %eax C 2*inv
-
- imull %edx, %edx C inv*inv
-
- leal (%esi,%ebp,4), %esi C src end
- leal (%edi,%ebp,4), %edi C dst end
- negl %ebp C -size
-
- imull %ebx, %edx C inv*inv*d
-
- subl %edx, %eax C inv = 2*inv - inv*inv*d
-
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
- pushl %eax FRAME_pushl()
- imull PARAM_DIVISOR, %eax
- cmpl $1, %eax
- popl %eax FRAME_popl()')
-
- jmp L(common)
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/dive_1.asm b/gmp/mpn/x86/k7/dive_1.asm
index 8eb4f45ac0..c994e0fb06 100644
--- a/gmp/mpn/x86/k7/dive_1.asm
+++ b/gmp/mpn/x86/k7/dive_1.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -116,7 +105,7 @@ ifdef(`PIC',`
subl %edx, %eax C inv = 2*inv - inv*inv*d
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
imull PARAM_DIVISOR, %eax
cmpl $1, %eax
diff --git a/gmp/mpn/x86/k7/gcd_1.asm b/gmp/mpn/x86/k7/gcd_1.asm
index c7d12c83c0..f912f43730 100644
--- a/gmp/mpn/x86/k7/gcd_1.asm
+++ b/gmp/mpn/x86/k7/gcd_1.asm
@@ -1,186 +1,369 @@
-dnl x86 mpn_gcd_1 optimised for AMD K7.
+dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
-dnl Contributed to the GNU project by by Kevin Ryde. Rehacked by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/bit (approx)
-C AMD K7 5.31
-C AMD K8,K9 5.33
-C AMD K10 5.30
-C AMD bd1 ?
-C AMD bobcat 7.02
-C Intel P4-2 10.1
-C Intel P4-3/4 10.0
-C Intel P6/13 5.88
-C Intel core2 6.26
-C Intel NHM 6.83
-C Intel SBR 8.50
-C Intel atom 8.90
-C VIA nano ?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C TODO
-C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
-C * Stream things better through registers, avoiding some copying.
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C K7: 6.75 cycles/bit (approx) 1x1 gcd
+C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
+
+
+dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
+dnl where x is the larger of the two. See tune/README for more.
+dnl
+dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
+dnl suggests 40/7*2=11.4 but 7 seems to be about right.
+
+deflit(DIV_THRESHOLD, 7)
+
+C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C
+C This is mixed in with the code, but as per the k7 optimization manual it's
+C a full cache line and suitably aligned so it won't get swapped between
+C code and data. Having it in TEXT rather than RODATA saves needing a GOT
+C entry when PIC.
+C
+C Actually, there doesn't seem to be a measurable difference between this in
+C its own cache line or plonked in the middle of the code. Presumably
+C since TEXT is read-only there's no worries about coherency.
+
+deflit(MASK, 63)
deflit(MAXSHIFT, 6)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-DEF_OBJECT(ctz_table,64)
+ TEXT
+ ALIGN(64)
+L(table):
.byte MAXSHIFT
forloop(i,1,MASK,
` .byte m4_count_trailing_zeros(i)
')
-END_OBJECT(ctz_table)
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`DIV_THRES_LOG2', 7)
+C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
+C
+
+defframe(PARAM_LIMB, 12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
-define(`up', `%edi')
-define(`n', `%esi')
-define(`v0', `%edx')
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(CALL_DIVISOR,-20)
+defframe(CALL_SIZE, -24)
+defframe(CALL_SRC, -28)
+deflit(STACK_SPACE, 28)
-ASM_START()
TEXT
ALIGN(16)
+
PROLOGUE(mpn_gcd_1)
- push %edi
- push %esi
+deflit(`FRAME',0)
+
+ ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0
+ ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1
+
+ movl PARAM_SRC, %eax
+ movl PARAM_LIMB, %edx
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
- mov 12(%esp), up
- mov 16(%esp), n
- mov 20(%esp), v0
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
- mov (up), %eax C U low limb
- or v0, %eax C x | y
- mov $-1, %ecx
+ movl (%eax), %esi C src low limb
+
+ifdef(`PIC',`
+ movl %edi, SAVE_EDI
+ call L(movl_eip_to_edi)
+L(here):
+ addl $L(table)-L(here), %edi
+')
+
+ movl %esi, %ebx
+ orl %edx, %esi C x|y
+ movl $-1, %ecx
L(twos):
- inc %ecx
- shr %eax
- jnc L(twos)
+ incl %ecx
+ shrl %esi
+ jnc L(twos) C 3/4 chance of x or y odd already
- shr %cl, v0
- mov %ecx, %eax C common twos
+ shrl %cl, %ebx
+ shrl %cl, %edx
+ movl %ecx, %esi C common twos
-L(divide_strip_y):
- shr v0
- jnc L(divide_strip_y)
- adc v0, v0
-
- push %eax
- push v0
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %ecx
- mov %ecx, %eax
- shr $DIV_THRES_LOG2, %ecx
- cmp %ecx, v0
- ja L(reduced)
-
- mov v0, %esi
- xor %edx, %edx
- div %esi
- mov %edx, %eax
- jmp L(reduced)
-
-L(reduce_nby1):
-ifdef(`PIC_WITH_EBX',`
- push %ebx
- call L(movl_eip_to_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
+ movl PARAM_SIZE, %ecx
+ cmpl $1, %ecx
+ ja L(divide)
+
+
+ C eax
+ C ebx x
+ C ecx
+ C edx y
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ movl %edx, %eax
+ cmpl %ebx, %edx
+
+ cmovb( %ebx, %eax) C swap to make x bigger than y
+ cmovb( %edx, %ebx)
+
+
+L(strip_y):
+ C eax x
+ C ebx y
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ ASSERT(nz,`orl %ebx,%ebx')
+ shrl %ebx
+ jnc L(strip_y)
+ rcll %ebx
+
+
+ C eax x
+ C ebx y (odd)
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ movl %eax, %ecx
+ movl %ebx, %edx
+ shrl $DIV_THRESHOLD, %eax
+
+ cmpl %eax, %ebx
+ movl %ecx, %eax
+ ja L(strip_x_entry) C do x%y if x much bigger than y
+
+
+ xorl %edx, %edx
+
+ divl %ebx
+
+ orl %edx, %edx
+ movl %edx, %eax C remainder -> x
+ movl %ebx, %edx C y
+
+ jz L(done_ebx)
+ jmp L(strip_x)
+
+
+ C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by
+ C ensuring the end of the jnz at the end of this loop doesn't cross
+ C into the next cache line at 0xC0.
+ C
+ C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
+ C it crosses but doesn't suffer any measurable slowdown.
+
+L(top):
+ C eax x
+ C ebx y-x
+ C ecx x-y
+ C edx y
+ C esi twos, for use at end
+ C edi [PIC] L(table)
+
+ cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x
+ cmovc( %eax, %edx)
+
+L(strip_x):
+ movl %ecx, %eax
+L(strip_x_entry):
+ andl $MASK, %ecx
+
+ ASSERT(nz, `orl %eax, %eax')
+
+ifdef(`PIC',`
+ movb (%ecx,%edi), %cl
+',`
+ movb L(table) (%ecx), %cl
')
- push v0 C param 3
- push n C param 2
- push up C param 1
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
- CALL( mpn_mod_1)
- jmp L(called)
-L(bmod):
- CALL( mpn_modexact_1_odd)
-
-L(called):
- add $12, %esp C deallocate params
-ifdef(`PIC_WITH_EBX',`
- pop %ebx
+
+ shrl %cl, %eax
+ cmpb $MAXSHIFT, %cl
+
+ movl %eax, %ecx
+ movl %edx, %ebx
+ je L(strip_x)
+
+ ASSERT(nz, `testl $1, %eax') C both odd
+ ASSERT(nz, `testl $1, %edx')
+
+ subl %eax, %ebx
+ subl %edx, %ecx
+ jnz L(top)
+
+
+L(done):
+ movl %esi, %ecx
+ movl SAVE_ESI, %esi
+ifdef(`PIC',`
+ movl SAVE_EDI, %edi
')
-L(reduced):
- pop %edx
-
- LEA( ctz_table, %esi)
- test %eax, %eax
- mov %eax, %ecx
- jnz L(mid)
- jmp L(end)
-
- ALIGN(16) C K8 BC P4 NHM SBR
-L(top): cmovc( %ecx, %eax) C if x-y < 0 0
- cmovc( %edi, %edx) C use x,y-x 0
-L(mid): and $MASK, %ecx C 0
- movzbl (%esi,%ecx), %ecx C 1
- jz L(shift_alot) C 1
- shr %cl, %eax C 3
- mov %eax, %edi C 4
- mov %edx, %ecx C 3
- sub %eax, %ecx C 4
- sub %edx, %eax C 4
- jnz L(top) C 5
-
-L(end): pop %ecx
- mov %edx, %eax
- shl %cl, %eax
- pop %esi
- pop %edi
- ret
-L(shift_alot):
- shr $MAXSHIFT, %eax
- mov %eax, %ecx
- jmp L(mid)
+ shll %cl, %eax
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
-ifdef(`PIC_WITH_EBX',`
-L(movl_eip_to_ebx):
- mov (%esp), %ebx
ret
+
+
+
+C -----------------------------------------------------------------------------
+C two or more limbs
+
+dnl MODEXACT_THRESHOLD is the size at which it's better to call
+dnl mpn_modexact_1_odd than do an inline loop.
+
+deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
+
+L(divide):
+ C eax src
+ C ebx
+ C ecx size
+ C edx y
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+L(divide_strip_y):
+ ASSERT(nz,`orl %edx,%edx')
+ shrl %edx
+ jnc L(divide_strip_y)
+ leal 1(%edx,%edx), %ebx C y now odd
+
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+ movl -4(%eax,%ecx,4), %eax C src high limb
+
+ cmp $MODEXACT_THRESHOLD, %ecx
+ jae L(modexact)
+
+ cmpl %ebx, %eax C high cmp divisor
+ movl $0, %edx
+
+ cmovc( %eax, %edx) C skip a div if high<divisor
+ sbbl $0, %ecx
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx y
+ C ecx counter (size to 1, inclusive)
+ C edx carry (remainder)
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp src
+
+ movl -4(%ebp,%ecx,4), %eax
+
+ divl %ebx
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+ C eax
+ C ebx y (odd)
+ C ecx
+ C edx x
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ orl %edx, %edx
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax
+
+ movl %edx, %ecx
+ movl %ebx, %edx
+ jnz L(strip_x_entry)
+
+
+L(done_ebx):
+ movl %ebx, %eax
+ jmp L(done)
+
+
+
+L(modexact):
+ C eax
+ C ebx y
+ C ecx size
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp src
+
+ifdef(`PIC',`
+ movl %ebp, CALL_SRC
+ movl %ebx, %ebp C y
+ movl %edi, %ebx C L(table)
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
+ movl %ebp, CALL_DIVISOR
+ movl %ecx, CALL_SIZE
+
+ call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
+',`
+dnl non-PIC
+ movl %ebx, CALL_DIVISOR
+ movl %ebp, CALL_SRC
+ movl %ecx, CALL_SIZE
+
+ call GSYM_PREFIX`'mpn_modexact_1_odd
')
+
+ C eax x
+ C ebx [non-PIC] y
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp [PIC] y
+
+ orl %eax, %eax
+ movl ifdef(`PIC',`%ebp',`%ebx'), %edx
+ movl SAVE_EBP, %ebp
+
+ movl %eax, %ecx
+ jnz L(strip_x_entry)
+
+ movl %edx, %eax
+ jmp L(done)
+
+
+ifdef(`PIC', `
+L(movl_eip_to_edi):
+ movl (%esp), %edi
+ ret_internal
+')
+
EPILOGUE()
diff --git a/gmp/mpn/x86/k7/gmp-mparam.h b/gmp/mpn/x86/k7/gmp-mparam.h
index 9977a113e2..ced0c020f7 100644
--- a/gmp/mpn/x86/k7/gmp-mparam.h
+++ b/gmp/mpn/x86/k7/gmp-mparam.h
@@ -1,241 +1,73 @@
/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2005, 2008-2010, 2014 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free
+Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-or both in parallel, as here.
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2083 MHz K7 Barton */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.2 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 3
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 24
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 85
-#define MUL_TOOM44_THRESHOLD 147
-#define MUL_TOOM6H_THRESHOLD 216
-#define MUL_TOOM8H_THRESHOLD 309
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 98
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 124
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 50
-#define SQR_TOOM3_THRESHOLD 81
-#define SQR_TOOM4_THRESHOLD 216
-#define SQR_TOOM6_THRESHOLD 306
-#define SQR_TOOM8_THRESHOLD 446
-
-#define MULMID_TOOM42_THRESHOLD 56
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define MUL_FFT_MODF_THRESHOLD 904 /* k = 6 */
-#define MUL_FFT_TABLE3 \
- { { 904, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \
- { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \
- { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
- { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
- { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
- { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
- { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \
- { 47, 7}, { 95, 8}, { 51, 9}, { 31, 8}, \
- { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
- { 127, 9}, { 71, 8}, { 143, 9}, { 79, 8}, \
- { 159,10}, { 47, 9}, { 95, 8}, { 191, 9}, \
- { 103,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 143,10}, { 79, 9}, { 167,10}, \
- { 95, 9}, { 199,10}, { 111,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
- { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
- { 383,10}, { 207,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 8}, { 1087,10}, \
- { 287,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 191,10}, { 383, 9}, { 767, 8}, { 1535, 9}, \
- { 799, 8}, { 1599,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 575, 9}, { 1151,10}, \
- { 607, 9}, { 1215, 8}, { 2431,11}, { 319,10}, \
- { 639, 9}, { 1279,10}, { 671, 9}, { 1343,12}, \
- { 191,11}, { 383,10}, { 767, 9}, { 1535,10}, \
- { 799, 9}, { 1599,10}, { 831, 9}, { 1663,10}, \
- { 863,13}, { 127,12}, { 255,11}, { 511,10}, \
- { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \
- { 1151,11}, { 607,10}, { 1215, 9}, { 2431,12}, \
- { 319,11}, { 639,10}, { 1407,11}, { 735,10}, \
- { 1471, 9}, { 2943,12}, { 383,11}, { 767,10}, \
- { 1535,11}, { 799,10}, { 1599,11}, { 831,10}, \
- { 1663,11}, { 895,10}, { 1791,11}, { 959,10}, \
- { 1919,13}, { 255,12}, { 511,11}, { 1023,10}, \
- { 2047,11}, { 1087,12}, { 575,11}, { 1151,10}, \
- { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \
- { 1279,10}, { 2559,11}, { 1407,10}, { 2815,11}, \
- { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1663,12}, { 895,11}, \
- { 1791,10}, { 3583,12}, { 959,11}, { 1919,10}, \
- { 3839,14}, { 255,13}, { 511,12}, { 1023,11}, \
- { 2047,12}, { 1087,11}, { 2175,12}, { 1151,11}, \
- { 2303,12}, { 1215,11}, { 2431,13}, { 639,12}, \
- { 1407,11}, { 2815,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1663,11}, { 3327,13}, { 895,12}, \
- { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \
- { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \
- { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
- { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
- { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
- { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \
- { 4095,13}, { 2175,12}, { 4351,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2559,12}, { 5119,13}, \
- { 2943,12}, { 5887,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 228
-#define MUL_FFT_THRESHOLD 7808
-
-#define SQR_FFT_MODF_THRESHOLD 888 /* k = 6 */
-#define SQR_FFT_TABLE3 \
- { { 888, 6}, { 21, 7}, { 11, 6}, { 25, 7}, \
- { 13, 6}, { 27, 7}, { 15, 6}, { 31, 7}, \
- { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
- { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
- { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
- { 39, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
- { 63, 8}, { 39, 9}, { 23, 8}, { 47, 7}, \
- { 95, 8}, { 51, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 79,10}, { 47, 9}, { 95, 8}, { 191,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \
- { 143,10}, { 79, 9}, { 167,10}, { 95, 9}, \
- { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
- { 255, 8}, { 511,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
- { 639, 8}, { 1279, 9}, { 671,11}, { 191,10}, \
- { 383, 9}, { 799, 8}, { 1599, 9}, { 831,11}, \
- { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 575, 9}, { 1215, 8}, \
- { 2431,11}, { 319,10}, { 639, 9}, { 1279,10}, \
- { 671, 9}, { 1407,12}, { 191,10}, { 799, 9}, \
- { 1599,10}, { 831, 9}, { 1663,10}, { 863, 9}, \
- { 1727,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087, 9}, \
- { 2175,10}, { 1119,11}, { 575,10}, { 1151,11}, \
- { 607,10}, { 1215, 9}, { 2431,12}, { 319,11}, \
- { 639,10}, { 1279,11}, { 671,10}, { 1343, 9}, \
- { 2687,11}, { 703,10}, { 1407,11}, { 735,10}, \
- { 1471, 9}, { 2943,10}, { 1503,12}, { 383,11}, \
- { 767,10}, { 1535,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 895,10}, \
- { 1791,11}, { 959,10}, { 1919,13}, { 255,12}, \
- { 511,11}, { 1023,10}, { 2047,11}, { 1087,10}, \
- { 2175,11}, { 1119,12}, { 575,11}, { 1151,10}, \
- { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \
- { 1407,10}, { 2815,11}, { 1471,10}, { 2943,12}, \
- { 767,11}, { 1599,12}, { 831,11}, { 1663,10}, \
- { 3327,12}, { 895,11}, { 1791,10}, { 3583,12}, \
- { 959,11}, { 1919,10}, { 3839,11}, { 1983,14}, \
- { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
- { 1087,11}, { 2175,12}, { 1151,11}, { 2303,12}, \
- { 1215,11}, { 2431,13}, { 639,12}, { 1407,11}, \
- { 2815,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1663,11}, { 3327,12}, { 1727,13}, { 895,12}, \
- { 1791,11}, { 3583,12}, { 1919,11}, { 3839,12}, \
- { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \
- { 2175,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
- { 2559,13}, { 1407,12}, { 2943,11}, { 5887,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \
- { 3327,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
- { 3967,15}, { 511,14}, { 1023,13}, { 2047,12}, \
- { 4095,13}, { 2175,12}, { 4351,13}, { 2431,14}, \
- { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 229
-#define SQR_FFT_THRESHOLD 7552
-
-#define MULLO_BASECASE_THRESHOLD 8
-#define MULLO_DC_THRESHOLD 36
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 45
-#define DC_DIVAPPR_Q_THRESHOLD 208
-#define DC_BDIV_QR_THRESHOLD 43
-#define DC_BDIV_Q_THRESHOLD 140
-
-#define INV_MULMOD_BNM1_THRESHOLD 62
-#define INV_NEWTON_THRESHOLD 204
-#define INV_APPR_THRESHOLD 204
-
-#define BINV_NEWTON_THRESHOLD 230
-#define REDC_1_TO_REDC_N_THRESHOLD 59
-
-#define MU_DIV_QR_THRESHOLD 1752
-#define MU_DIVAPPR_Q_THRESHOLD 1528
-#define MUPI_DIV_QR_THRESHOLD 82
-#define MU_BDIV_QR_THRESHOLD 1360
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 1,16,102,336,1221
-
-#define MATRIX22_STRASSEN_THRESHOLD 16
-#define HGCD_THRESHOLD 120
-#define HGCD_APPR_THRESHOLD 143
-#define HGCD_REDUCE_THRESHOLD 4818
-#define GCD_DC_THRESHOLD 474
-#define GCDEXT_DC_THRESHOLD 345
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 15
-#define GET_STR_PRECOMPUTE_THRESHOLD 33
-#define SET_STR_DC_THRESHOLD 298
-#define SET_STR_PRECOMPUTE_THRESHOLD 1187
-
-#define FAC_DSC_THRESHOLD 602
-#define FAC_ODD_THRESHOLD 29
+/* 2083 MHz Athlon */
+
+/* Generated by tuneup.c, 2008-12-23, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 28
+#define MUL_TOOM3_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 52
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 196
+
+#define MULLOW_BASECASE_THRESHOLD 10
+#define MULLOW_DC_THRESHOLD 96
+#define MULLOW_MUL_N_THRESHOLD 234
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 86
+#define POWM_THRESHOLD 134
+#define MATRIX22_STRASSEN_THRESHOLD 18
+#define HGCD_THRESHOLD 163
+#define GCD_DC_THRESHOLD 665
+#define GCDEXT_DC_THRESHOLD 605
+#define JACOBI_BASE_METHOD 1
+
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 19
+#define GET_STR_PRECOMPUTE_THRESHOLD 35
+#define SET_STR_DC_THRESHOLD 826
+#define SET_STR_PRECOMPUTE_THRESHOLD 1691
+
+#define MUL_FFT_TABLE { 432, 864, 1664, 4608, 10240, 40960, 163840, 655360, 0 }
+#define MUL_FFT_MODF_THRESHOLD 496
+#define MUL_FFT_THRESHOLD 4864
+
+#define SQR_FFT_TABLE { 432, 864, 1664, 4608, 10240, 40960, 98304, 655360, 0 }
+#define SQR_FFT_MODF_THRESHOLD 432
+#define SQR_FFT_THRESHOLD 3840
+
+/* These tables need to be updated. */
+
+#define MUL_FFT_TABLE2 {{1, 4}, {401, 5}, {801, 6}, {817, 5}, {865, 6}, {1025, 5}, {1057, 6}, {1601, 7}, {1633, 6}, {1729, 7}, {1921, 6}, {2113, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {2945, 6}, {3009, 7}, {3457, 8}, {3521, 7}, {4481, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6017, 8}, {7553, 9}, {7681, 8}, {9985, 9}, {11777, 8}, {13057, 9}, {13825, 8}, {14081, 9}, {15873, 8}, {16641, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24577, 9}, {25089, 8}, {25345, 9}, {27393, 10}, {27649, 9}, {28161, 10}, {31745, 9}, {38913, 10}, {39425, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {98305, 11}, {99329, 10}, {100353, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {110593, 11}, {112641, 10}, {113665, 11}, {129025, 10}, {162817, 11}, {194561, 10}, {195585, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, {3137537, 12}, {3928065, 13}, {4186113, 12}, {4976641, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1, 4}, {401, 5}, {417, 4}, {433, 5}, {881, 6}, {961, 5}, {993, 6}, {1857, 7}, {1921, 6}, {2049, 7}, {2177, 6}, {2241, 7}, {2433, 6}, {2497, 7}, {3457, 8}, {3841, 7}, {4481, 8}, {4609, 7}, {4737, 8}, {4865, 7}, {5249, 8}, {5889, 7}, {6273, 8}, {7041, 9}, {7681, 8}, {9985, 9}, {10241, 8}, {10497, 9}, {11777, 8}, {13057, 9}, {15873, 8}, {16385, 9}, {16897, 8}, {17153, 9}, {19969, 8}, {20225, 9}, {20737, 8}, {20993, 9}, {24065, 8}, {24321, 9}, {24577, 10}, {24833, 9}, {25601, 10}, {27137, 9}, {27649, 10}, {31745, 9}, {38401, 10}, {38913, 9}, {40449, 10}, {48129, 9}, {48641, 11}, {63489, 10}, {99329, 11}, {101377, 10}, {103425, 11}, {104449, 10}, {107521, 11}, {110593, 10}, {113665, 11}, {129025, 10}, {154625, 11}, {155649, 10}, {162817, 11}, {194561, 12}, {258049, 11}, {391169, 12}, {520193, 11}, {718849, 12}, {727041, 11}, {729089, 12}, {782337, 11}, {849921, 13}, {1040385, 12}, {2879489, 13}, {3137537, 12}, {3928065, 13}, {4186113, 12}, {4714497, 13}, {5234689, 12}, {6025217, 13}, {6283265, 12}, {7073793, 13}, {7331841, 12}, {MP_SIZE_T_MAX,0}}
diff --git a/gmp/mpn/x86/k7/invert_limb.asm b/gmp/mpn/x86/k7/invert_limb.asm
deleted file mode 100644
index 6cce455a9d..0000000000
--- a/gmp/mpn/x86/k7/invert_limb.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-dnl x86 mpn_invert_limb
-
-dnl Contributed to the GNU project by Niels Möller
-
-dnl Copyright 2009, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles (approx) div
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) ?
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 ?
-C AMD K7 41 53
-C AMD K8 ?
-
-C TODO
-C * These c/l numbers are for a non-PIC build. Consider falling back to using
-C the 'div' instruction for PIC builds.
-C * Perhaps use this file--or at least the algorithm--for more machines than k7.
-
-C Register usage:
-C Input D in %edi
-C Current approximation is in %eax and/or %ecx
-C %ebx and %edx are temporaries
-C %esi and %ebp are unused
-
-defframe(PARAM_DIVISOR,4)
-
-ASM_START()
-
-C Make approx_tab global to work around Apple relocation bug.
-ifdef(`DARWIN',`
- deflit(`approx_tab', MPN(invert_limb_tab))
- GLOBL approx_tab')
-
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_invert_limb)
-deflit(`FRAME', 0)
- mov PARAM_DIVISOR, %eax
- C Avoid push/pop on k7.
- sub $8, %esp FRAME_subl_esp(8)
- mov %ebx, (%esp)
- mov %edi, 4(%esp)
-
- mov %eax, %edi
- shr $22, %eax
-ifdef(`PIC',`
- LEA( approx_tab, %ebx)
- movzwl -1024(%ebx, %eax, 2), %eax
-',`
- movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0
-')
-
- C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
- mov %eax, %ecx
- imul %eax, %eax
- mov %edi, %ebx
- shr $11, %ebx
- inc %ebx
- mul %ebx
- mov %edi, %ebx C Prepare
- shr %ebx
- sbb %eax, %eax
- sub %eax, %ebx C %ebx = d_31, %eax = mask
- shl $4, %ecx
- dec %ecx
- sub %edx, %ecx C %ecx = v1
-
- C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
- imul %ecx, %ebx
- and %ecx, %eax
- shr %eax
- sub %ebx, %eax
- mul %ecx
- mov %edi, %eax C Prepare for next mul
- shl $15, %ecx
- shr %edx
- add %edx, %ecx C %ecx = v2
-
- mul %ecx
- add %edi, %eax
- mov %ecx, %eax
- adc %edi, %edx
- sub %edx, %eax C %eax = v3
-
- mov (%esp), %ebx
- mov 4(%esp), %edi
- add $8, %esp
-
- ret
-
-EPILOGUE()
-
-DEF_OBJECT(approx_tab,2)
- .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
- .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
- .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
- .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
- .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
- .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
- .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
- .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
- .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
- .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
- .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
- .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
- .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
- .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
- .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
- .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
- .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
- .value 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
- .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
- .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
- .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
- .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
- .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
- .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
- .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
- .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
- .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
- .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
- .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
- .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
- .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
- .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
- .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
- .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
- .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
- .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
- .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
- .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
- .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
- .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
- .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
- .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
- .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
- .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
- .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
- .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
- .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
- .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
- .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
- .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
- .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
- .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
- .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
- .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
- .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
- .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
- .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
- .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
- .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
- .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
- .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
- .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
- .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
- .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
-END_OBJECT(approx_tab)
diff --git a/gmp/mpn/x86/k7/mmx/com.asm b/gmp/mpn/x86/k7/mmx/com_n.asm
index a258c224f1..068c01f076 100644
--- a/gmp/mpn/x86/k7/mmx/com.asm
+++ b/gmp/mpn/x86/k7/mmx/com_n.asm
@@ -1,32 +1,21 @@
-dnl AMD Athlon mpn_com -- mpn bitwise one's complement.
+dnl AMD Athlon mpn_com_n -- mpn bitwise one's complement.
dnl Copyright 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -34,7 +23,7 @@ include(`../config.m4')
C K7: 1.0 cycles/limb
-C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The loop form below is necessary for the claimed speed. It needs to be
C aligned to a 16 byte boundary and only 16 bytes long. Maybe that's so it
@@ -62,7 +51,7 @@ defframe(PARAM_DST, 4)
TEXT
ALIGN(16)
-PROLOGUE(mpn_com)
+PROLOGUE(mpn_com_n)
deflit(`FRAME',0)
movl PARAM_DST, %edx
diff --git a/gmp/mpn/x86/k7/mmx/copyd.asm b/gmp/mpn/x86/k7/mmx/copyd.asm
index 59ece40920..4601fcd75a 100644
--- a/gmp/mpn/x86/k7/mmx/copyd.asm
+++ b/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mmx/copyi.asm b/gmp/mpn/x86/k7/mmx/copyi.asm
index 9a28f927ec..a17d575ff4 100644
--- a/gmp/mpn/x86/k7/mmx/copyi.asm
+++ b/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mmx/divrem_1.asm b/gmp/mpn/x86/k7/mmx/divrem_1.asm
index cf343280bb..fa5824c7b9 100644
--- a/gmp/mpn/x86/k7/mmx/divrem_1.asm
+++ b/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -1,33 +1,22 @@
dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
dnl division.
-dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -456,7 +445,7 @@ C chain, and nothing better than 18 cycles has been found when using it.
C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
C be an extremely rare event.
C
-C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
C if some special data is coming out with this always, the q1_ff special
C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
C induce the q1_ff case, for speed measurements or testing. Note that
@@ -735,12 +724,12 @@ C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
C rnd() means rounding down to a multiple of d.
C
C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
-C = m*d + b*d - m - b
-C = floor((b(b-d)-1)/d)*d + b*d - m - b
-C = rnd(b(b-d)-1) + b*d - m - b
-C = rnd(b(b-d)-1 + b*d) - m - b
-C = rnd(b*b-1) - m - b
-C <= (b-2)*b
+C = m*d + b*d - m - b
+C = floor((b(b-d)-1)/d)*d + b*d - m - b
+C = rnd(b(b-d)-1) + b*d - m - b
+C = rnd(b(b-d)-1 + b*d) - m - b
+C = rnd(b*b-1) - m - b
+C <= (b-2)*b
C
C Unchanged from the general case is that the final quotient limb q can be
C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
diff --git a/gmp/mpn/x86/k7/mmx/lshift.asm b/gmp/mpn/x86/k7/mmx/lshift.asm
index b3383cf2c3..b3bff8ffd1 100644
--- a/gmp/mpn/x86/k7/mmx/lshift.asm
+++ b/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_lshift -- mpn left shift.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mmx/mod_1.asm b/gmp/mpn/x86/k7/mmx/mod_1.asm
new file mode 100644
index 0000000000..2b42e55caf
--- /dev/null
+++ b/gmp/mpn/x86/k7/mmx/mod_1.asm
@@ -0,0 +1,509 @@
+dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 17.0 cycles/limb.
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse);
+C
+C The code here is the same as mpn_divrem_1, but with the quotient
+C discarded. See mpn/x86/k7/mmx/divrem_1.asm for some comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 41 c/l.
+dnl
+dnl Using mul or div is about the same speed at 3 limbs, so the threshold
+dnl is set to 4 to get the smaller div code used at 3.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ TEXT
+
+ ALIGN(32)
+C mpn_preinv_mod_1 entry point: remainder of {src,size} by divisor, using the
+C inverse passed in PARAM_INVERSE rather than computing one.
+C
+C VAR_NORM is set to 0 and the shift count to 32, i.e. the divisor is used
+C unshifted. NOTE(review): this presumably relies on the caller passing a
+C divisor with its high bit already set - confirm against the callers.
+C
+C The high limb is reduced with one conditional subtract of the divisor, then
+C the code dispatches on size into the tail shared with mpn_mod_1: size==1
+C returns at once, size==2 and size==3 enter the one and two step tails, and
+C size>=4 enters the main loop.
+C
+C NOTE(review): there is no size==0 test here, the high limb load below would
+C read src[-1] in that case - presumably callers guarantee size>=1.
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+ movl PARAM_SRC, %ecx
+ movl PARAM_SIZE, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_INVERSE, %edx
+
+ movl %esi, SAVE_ESI
+ movl -4(%ecx,%eax,4), %edi C src high limb
+ leal -16(%ecx,%eax,4), %ecx C &src[size-4]
+
+ movl %ebx, SAVE_EBX
+ C NOTE(review): %edx already holds PARAM_INVERSE from the load above and
+ C is not written in between, so this reload looks redundant - confirm
+ C before removing.
+ movl PARAM_INVERSE, %edx
+
+ movl $0, VAR_NORM C l==0
+
+ movl %edi, %esi
+ subl %ebp, %edi C high-divisor
+
+ cmovc( %esi, %edi) C restore if underflow
+ decl %eax
+ jz L(done_edi) C size==1, high-divisor only
+
+ movl 8(%ecx), %esi C src second high limb
+ movl %edx, VAR_INVERSE
+
+ movl $32, %ebx C 32-l
+ decl %eax
+ jz L(inverse_one_left) C size==2, one divide
+
+ movd %ebx, %mm7 C 32-l
+ decl %eax
+ jz L(inverse_two_left) C size==3, two divides
+
+ jmp L(inverse_top) C size>=4
+
+
+L(done_edi):
+ C size==1 exit, the reduced high limb in edi is the result.
+ C %ebx is not reloaded here: it was saved above but has not been written
+ C yet when this branch is taken. No MMX register has been written either.
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl %edi, %eax
+
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+
+
+ ALIGN(32)
+C mpn_mod_1c entry point: like mpn_mod_1 but starting from a caller-supplied
+C carry. It loads the register state expected at L(start_1c) (ecx size,
+C edx carry, esi src, ebp divisor) and joins the mpn_mod_1 code there, which
+C bypasses the high limb shortcut done at the top of mpn_mod_1.
+C
+C NOTE(review): the carry becomes the high word of the first dividend, so it
+C presumably must be less than the divisor - confirm against the callers.
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(32)
+C mpn_mod_1 entry point. Returns 0 for size==0. Otherwise, if the high limb
+C is already less than the divisor it is taken directly as the initial carry
+C and size is reduced by one, which saves a division step.
+C
+C From L(start_1c) on the code is shared with mpn_mod_1c: sizes below
+C MUL_THRESHOLD go through the plain divl loop, larger sizes branch to
+C L(mul_by_inverse).
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ C cmovc does not change the flags, so the sbbl still sees the carry
+ C produced by the cmpl.
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(divide_done)
+
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+
+C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
+C but it's already fast and compact, and there's nothing to gain by
+C expanding it out.
+C
+C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
+C
+C The size test below can only see zero when entered from mpn_mod_1c, since
+C mpn_mod_1 has already branched away on a zero size above.
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ movl -4(%esi,%ecx,4), %eax
+
+ C edx:eax = carry:limb, the remainder left in edx is the next carry
+ divl PARAM_DIVISOR
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ C Result is the carry in edx. Only esi and ebp were saved on the paths
+ C reaching here, so only they are restored.
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ movl %edx, %eax
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+C Set-up for the multiply-by-inverse method, reached from L(start_1c) when
+C size is at least MUL_THRESHOLD.
+C
+C l = number of leading zero bits of the divisor (31 minus the bsrl result)
+C d = divisor << l
+C m = floor ((b*(b-d)-1) / d) with b = 2^32, obtained from a single divl
+C n2 = high 32 bits of carry:high << l
+C n10 = high 32 bits of high:second << l
+C
+C l is stored in VAR_NORM, 32-l in mm7, m in VAR_INVERSE, and ecx is left
+C pointing at src[size-3]. ebx and edi are stored to their save slots before
+C being overwritten.
+C
+C NOTE(review): bsrl leaves its destination undefined for a zero source, so a
+C nonzero divisor is presumably required - confirm.
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ movl %ebx, SAVE_EBX
+ movl %ecx, %ebx C size
+
+ movl %edi, SAVE_EDI
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7 C 32-l
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+C The size==2 test below is only assembled when MUL_THRESHOLD is 2. With the
+C value 4 given by the deflit in this file, m4 drops it and execution simply
+C continues into the main loop.
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs.
+C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
+C
+C There's four dummy instructions in the loop, all of which are necessary
+C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
+C or changed from load to store or vice versa. They're not completely
+C random, since they correspond to what mpn_divrem_1 has, but there's no
+C obvious reason why they're necessary. Presumably they induce something
+C good in the out of order execution, perhaps through some load/store
+C ordering and/or decoding effects.
+C
+C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
+C special data that comes out as q1==0xFFFFFFFF always, the loop runs at
+C about 13.5 c/l.
+
+C Main loop, one source limb per iteration. The quotient estimate q1+1 is
+C formed from n2, n10 and the inverse, (q1+1)*d is subtracted from n2:n10, and
+C d is added back if that went negative. The new remainder becomes n2 and the
+C next normalized source limb, extracted from mm0, becomes n10.
+C
+C The loop repeats while ecx, after its decrement, is still at or above
+C PARAM_SRC. The instructions marked dummy do not contribute to the result.
+
+ ALIGN(32)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SIZE, %ebx C dummy
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, PARAM_SIZE C dummy
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ C zero here means q1+1 wrapped, i.e. q1 == 0xFFFFFFFF
+ jz L(q1_ff)
+ nop C dummy
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ leal (%ecx), %ecx C dummy
+
+ C
+
+ C
+
+ subl %eax, %esi C low n - (q1+1)*d
+ movl PARAM_SRC, %eax
+
+ C
+
+ C The movl, leal and movd after the sbbl leave the flags alone, so the
+ C borrow from the sbbl is what the cmovc tests.
+ sbbl %edx, %edi C high n - (q1+1)*d, 0 or -1
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ cmpl %eax, %ecx
+ jae L(inverse_top)
+
+
+C Entered by falling out of the main loop, by the jump from L(q1_ff), or
+C directly from mpn_preinv_mod_1 for size==3. Two division steps remain: this
+C one, which also fetches the low source limb, and the one at
+C L(inverse_one_left) that this code falls through to.
+C
+C Differences from the loop body: only one dword is loaded (psllq moves it to
+C the high half of mm0 so that the shift by mm7 brings in zero bits below it),
+C and a q1+1 that wraps to zero is pulled back to 0xFFFFFFFF with sbbl rather
+C than branching to L(q1_ff).
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ C carry out of the adcl means q1+1 wrapped to 0, step back to 0xFFFFFFFF
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+
+C Final division step and function exit. Entered by falling through from
+C L(inverse_two_left), or directly from mpn_preinv_mod_1 for size==2.
+C
+C The step is the same as the previous one but with no further limb to fetch.
+C The register restores are interleaved with it, then the remainder is shifted
+C right by VAR_NORM to undo the normalization and returned in eax. emms
+C clears the MMX state before returning.
+L(inverse_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ C carry out of the adcl means q1+1 wrapped to 0, step back to 0xFFFFFFFF
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ C The movl and leal instructions mixed in below do not modify the flags,
+ C so the borrow chain subl -> sbbl -> cmovc is unaffected by the restores.
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%eax), %edx
+ movl SAVE_EBP, %ebp
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+C Reached from the main loop when q1+1 came out as zero. This does the rest
+C of the loop iteration for that case (next n2, next n10 from mm0, loop test)
+C without the second multiply, then either rejoins the loop or leaves it
+C through L(inverse_loop_done).
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ movl PARAM_SRC, %edx
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+
+ C same termination test as at the bottom of the main loop
+ cmpl %edx, %ecx
+ jae L(inverse_top)
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mmx/popham.asm b/gmp/mpn/x86/k7/mmx/popham.asm
index 95965b74d4..5dc0a78c42 100644
--- a/gmp/mpn/x86/k7/mmx/popham.asm
+++ b/gmp/mpn/x86/k7/mmx/popham.asm
@@ -1,40 +1,29 @@
dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
dnl distance.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C popcount hamdist
C P3 generic 6.5 7
-C P3 model 9 (Banias) 5.7 6.1
+C P3 model 9 (Banias) ? ?
C P3 model 13 (Dothan) 5.75 6
C K7 5 6
diff --git a/gmp/mpn/x86/k7/mmx/rshift.asm b/gmp/mpn/x86/k7/mmx/rshift.asm
index 345d23a25e..3566ce85d7 100644
--- a/gmp/mpn/x86/k7/mmx/rshift.asm
+++ b/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_rshift -- mpn right shift.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mod_1_1.asm b/gmp/mpn/x86/k7/mod_1_1.asm
deleted file mode 100644
index 1bbe6f92d7..0000000000
--- a/gmp/mpn/x86/k7/mod_1_1.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-dnl x86-32 mpn_mod_1_1p, requiring cmov.
-
-dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
-
-dnl Copyright 2010, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) ?
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 ?
-C AMD K7 7
-C AMD K8 ?
-
-define(`B2mb', `%ebx')
-define(`r0', `%esi')
-define(`r2', `%ebp')
-define(`t0', `%edi')
-define(`ap', `%ecx') C Also shift count
-
-C Stack frame
-C pre 36(%esp)
-C b 32(%esp)
-C n 28(%esp)
-C ap 24(%esp)
-C return 20(%esp)
-C %ebp 16(%esp)
-C %edi 12(%esp)
-C %esi 8(%esp)
-C %ebx 4(%esp)
-C B2mod (%esp)
-
-define(`B2modb', `(%esp)')
-define(`n', `28(%esp)')
-define(`b', `32(%esp)')
-define(`pre', `36(%esp)')
-
-C mp_limb_t
-C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
-C
-C The pre array contains bi, cnt, B1modb, B2modb
-C Note: This implementation needs B1modb only when cnt > 0
-
-ASM_START()
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_mod_1_1p)
- push %ebp
- push %edi
- push %esi
- push %ebx
- mov 32(%esp), %ebp C pre[]
-
- mov 12(%ebp), %eax C B2modb
- push %eax C Put it on stack
-
- mov n, %edx
- mov 24(%esp), ap
-
- lea (ap, %edx, 4), ap
- mov -4(ap), %eax
- cmp $3, %edx
- jnc L(first)
- mov -8(ap), r0
- jmp L(reduce_two)
-
-L(first):
- C First iteration, no r2
- mull B2modb
- mov -12(ap), r0
- add %eax, r0
- mov -8(ap), %eax
- adc %edx, %eax
- sbb r2, r2
- subl $3, n
- lea -16(ap), ap
- jz L(reduce_three)
-
- mov B2modb, B2mb
- sub b, B2mb
- lea (B2mb, r0), t0
- jmp L(mid)
-
- ALIGN(16)
-L(top): C Loopmixed to 7 c/l on k7
- add %eax, r0
- lea (B2mb, r0), t0
- mov r2, %eax
- adc %edx, %eax
- sbb r2, r2
-L(mid): mull B2modb
- and B2modb, r2
- add r0, r2
- decl n
- mov (ap), r0
- cmovc( t0, r2)
- lea -4(ap), ap
- jnz L(top)
-
- add %eax, r0
- mov r2, %eax
- adc %edx, %eax
- sbb r2, r2
-
-L(reduce_three):
- C Eliminate r2
- and b, r2
- sub r2, %eax
-
-L(reduce_two):
- mov pre, %ebp
- movb 4(%ebp), %cl
- test %cl, %cl
- jz L(normalized)
-
- C Unnormalized, use B1modb to reduce to size < B b
- mull 8(%ebp)
- xor t0, t0
- add %eax, r0
- adc %edx, t0
- mov t0, %eax
-
- C Left-shift to normalize
- shld %cl, r0, %eax C Always use shld?
-
- shl %cl, r0
- jmp L(udiv)
-
-L(normalized):
- mov %eax, t0
- sub b, t0
- cmovnc( t0, %eax)
-
-L(udiv):
- lea 1(%eax), t0
- mull (%ebp)
- mov b, %ebx C Needed in register for lea
- add r0, %eax
- adc t0, %edx
- imul %ebx, %edx
- sub %edx, r0
- cmp r0, %eax
- lea (%ebx, r0), %eax
- cmovnc( r0, %eax)
- cmp %ebx, %eax
- jnc L(fix)
-L(ok): shr %cl, %eax
-
- add $4, %esp
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
-
- ret
-L(fix): sub %ebx, %eax
- jmp L(ok)
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1_1p_cps)
- push %ebp
- mov 12(%esp), %ebp
- push %esi
- bsr %ebp, %ecx
- push %ebx
- xor $31, %ecx
- mov 16(%esp), %esi
- sal %cl, %ebp
- mov %ebp, %edx
- not %edx
- mov $-1, %eax
- div %ebp C On K7, invert_limb would be a few cycles faster.
- mov %eax, (%esi) C store bi
- mov %ecx, 4(%esi) C store cnt
- neg %ebp
- mov $1, %edx
- shld %cl, %eax, %edx
- imul %ebp, %edx
- shr %cl, %edx
- imul %ebp, %eax
- mov %edx, 8(%esi) C store B1modb
- mov %eax, 12(%esi) C store B2modb
- pop %ebx
- pop %esi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_1_4.asm b/gmp/mpn/x86/k7/mod_1_4.asm
deleted file mode 100644
index bb7597edd2..0000000000
--- a/gmp/mpn/x86/k7/mod_1_4.asm
+++ /dev/null
@@ -1,260 +0,0 @@
-dnl x86-32 mpn_mod_1s_4p, requiring cmov.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 6
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) 15.5
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 ?
-C AMD K7 4.75
-C AMD K8 ?
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p)
- push %ebp
- push %edi
- push %esi
- push %ebx
- sub $28, %esp
- mov 60(%esp), %edi C cps[]
- mov 8(%edi), %eax
- mov 12(%edi), %edx
- mov 16(%edi), %ecx
- mov 20(%edi), %esi
- mov 24(%edi), %edi
- mov %eax, 4(%esp)
- mov %edx, 8(%esp)
- mov %ecx, 12(%esp)
- mov %esi, 16(%esp)
- mov %edi, 20(%esp)
- mov 52(%esp), %eax C n
- xor %edi, %edi
- mov 48(%esp), %esi C up
- lea -12(%esi,%eax,4), %esi
- and $3, %eax
- je L(b0)
- cmp $2, %eax
- jc L(b1)
- je L(b2)
-
-L(b3): mov 4(%esi), %eax
- mull 4(%esp)
- mov (%esi), %ebp
- add %eax, %ebp
- adc %edx, %edi
- mov 8(%esi), %eax
- mull 8(%esp)
- lea -12(%esi), %esi
- jmp L(m0)
-
-L(b0): mov (%esi), %eax
- mull 4(%esp)
- mov -4(%esi), %ebp
- add %eax, %ebp
- adc %edx, %edi
- mov 4(%esi), %eax
- mull 8(%esp)
- add %eax, %ebp
- adc %edx, %edi
- mov 8(%esi), %eax
- mull 12(%esp)
- lea -16(%esi), %esi
- jmp L(m0)
-
-L(b1): mov 8(%esi), %ebp
- lea -4(%esi), %esi
- jmp L(m1)
-
-L(b2): mov 8(%esi), %edi
- mov 4(%esi), %ebp
- lea -8(%esi), %esi
- jmp L(m1)
-
- ALIGN(16)
-L(top): mov (%esi), %eax
- mull 4(%esp)
- mov -4(%esi), %ebx
- xor %ecx, %ecx
- add %eax, %ebx
- adc %edx, %ecx
- mov 4(%esi), %eax
- mull 8(%esp)
- add %eax, %ebx
- adc %edx, %ecx
- mov 8(%esi), %eax
- mull 12(%esp)
- add %eax, %ebx
- adc %edx, %ecx
- lea -16(%esi), %esi
- mov 16(%esp), %eax
- mul %ebp
- add %eax, %ebx
- adc %edx, %ecx
- mov 20(%esp), %eax
- mul %edi
- mov %ebx, %ebp
- mov %ecx, %edi
-L(m0): add %eax, %ebp
- adc %edx, %edi
-L(m1): subl $4, 52(%esp)
- ja L(top)
-
-L(end): mov 4(%esp), %eax
- mul %edi
- mov 60(%esp), %edi
- add %eax, %ebp
- adc $0, %edx
- mov 4(%edi), %ecx
- mov %edx, %esi
- mov %ebp, %eax
- sal %cl, %esi
- mov %ecx, %ebx
- neg %ecx
- shr %cl, %eax
- or %esi, %eax
- lea 1(%eax), %esi
- mull (%edi)
- mov %ebx, %ecx
- mov %eax, %ebx
- mov %ebp, %eax
- mov 56(%esp), %ebp
- sal %cl, %eax
- add %eax, %ebx
- adc %esi, %edx
- imul %ebp, %edx
- sub %edx, %eax
- lea (%eax,%ebp), %edx
- cmp %eax, %ebx
- cmovc( %edx, %eax)
- mov %eax, %edx
- sub %ebp, %eax
- cmovc( %edx, %eax)
- add $28, %esp
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
- shr %cl, %eax
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p_cps)
-C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
- push %ebp
- push %edi
- push %esi
- push %ebx
- mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
- mov 24(%esp), %ebx
- bsr %ebx, %ecx
- xor $31, %ecx
- sal %cl, %ebx C b << cnt
- mov %ebx, %edx
- not %edx
- mov $-1, %eax
- div %ebx
- xor %edi, %edi
- sub %ebx, %edi
- mov $1, %esi
- mov %eax, (%ebp) C store bi
- mov %ecx, 4(%ebp) C store cnt
- shld %cl, %eax, %esi
- imul %edi, %esi
- mov %eax, %edi
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 8(%ebp) C store B1modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 12(%ebp) C store B2modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 16(%ebp) C store B3modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 20(%ebp) C store B4modb
-
- not %edx
- imul %ebx, %edx
- add %edx, %ebx
- cmp %edx, %eax
- cmovnc( %edx, %ebx)
-
- shr %cl, %ebx
- mov %ebx, 24(%ebp) C store B5modb
-
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/k7/mod_34lsub1.asm b/gmp/mpn/x86/k7/mod_34lsub1.asm
index ee3ad04099..f00e84dc42 100644
--- a/gmp/mpn/x86/k7/mod_34lsub1.asm
+++ b/gmp/mpn/x86/k7/mod_34lsub1.asm
@@ -1,32 +1,22 @@
dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/mode1o.asm b/gmp/mpn/x86/k7/mode1o.asm
index 6472ec5949..ef858049a6 100644
--- a/gmp/mpn/x86/k7/mode1o.asm
+++ b/gmp/mpn/x86/k7/mode1o.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
-dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -122,7 +111,7 @@ ifdef(`PIC',`
subl %eax, %edi C inv = 2*inv - inv*inv*d
- ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
movl %esi, %eax
imull %edi, %eax
cmpl $1, %eax')
diff --git a/gmp/mpn/x86/k7/mul_1.asm b/gmp/mpn/x86/k7/mul_1.asm
index 755cd2ed50..016262d594 100644
--- a/gmp/mpn/x86/k7/mul_1.asm
+++ b/gmp/mpn/x86/k7/mul_1.asm
@@ -1,38 +1,28 @@
dnl AMD K7 mpn_mul_1.
-dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5
+C cycles/limb
+C P5:
C P6 model 0-8,10-12)
C P6 model 9 (Banias)
C P6 model 13 (Dothan)
@@ -41,9 +31,9 @@ C P4 model 1 (?)
C P4 model 2 (Northwood)
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6
-C AMD K7 3.25
-C AMD K8
+C K6:
+C K7: 3.25
+C K8:
C TODO
C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
diff --git a/gmp/mpn/x86/k7/mul_basecase.asm b/gmp/mpn/x86/k7/mul_basecase.asm
index 4dfb500885..7f4c0002f7 100644
--- a/gmp/mpn/x86/k7/mul_basecase.asm
+++ b/gmp/mpn/x86/k7/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/k7/sqr_basecase.asm b/gmp/mpn/x86/k7/sqr_basecase.asm
index 7b6a97e0df..408a13dc9b 100644
--- a/gmp/mpn/x86/k7/sqr_basecase.asm
+++ b/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -39,18 +28,18 @@ C roughly the Karatsuba recursing range).
dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
dnl some comments.
-deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66)
-ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
-`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
+`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
-m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
-deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
-C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
+C With a SQR_KARATSUBA_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.
diff --git a/gmp/mpn/x86/k7/sublsh1_n.asm b/gmp/mpn/x86/k7/sublsh1_n.asm
deleted file mode 100644
index 523b01218d..0000000000
--- a/gmp/mpn/x86/k7/sublsh1_n.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns. The
-C innerloop is 2*3-way unrolled, which is best we can do with the available
-C registers. It seems tricky to use the same structure for rsblsh1_n, since we
-C cannot feed carry between operations there.
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 6.75
-C AMD K6
-C AMD K7
-C AMD K8
-
-C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
-C processors. It uses 2*4-way unrolling, for good reasons.
-C
-C Breaking carry recurrency might be a good idea. We would then need separate
-C registers for the shift carry and add/subtract carry, which in turn would
-C force is to 2*2-way unrolling.
-
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(VAR_COUNT,`PARAM_SIZE')
-define(SAVE_EBX,`PARAM_SRC')
-define(SAVE_EBP,`PARAM_DST')
-
-ASM_START()
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_sublsh1_n_ip1)
-deflit(`FRAME',0)
-
-define(`rp', `%edi')
-define(`up', `%esi')
-
- mov PARAM_SIZE, %eax C size
- push up FRAME_pushl()
- push rp FRAME_pushl()
- xor %edx, %edx
- mov PARAM_SRC, up
- mov PARAM_DST, rp
- mov %ebx, SAVE_EBX
- mov %eax, %ebx
- shr $3, %eax
-
- not %eax C count = -(size\8)-i
- and $7, %ebx C size % 8
- jz L(exact)
-
-L(oop):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (up), %ecx
- adc %ecx, %ecx
- rcr %edx C restore 1st saved carry bit
- lea 4(up), up
- sbb %ecx, (rp)
- lea 4(rp), rp
- adc %edx, %edx C save a carry bit in edx
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- dec %ebx
- jnz L(oop)
-L(exact):
- inc %eax
- jz L(end)
- mov %eax, VAR_COUNT
- mov %ebp, SAVE_EBP
-
- ALIGN(16)
-L(top):
-ifdef(`CPU_P6',`
- shr %edx ') C restore 2nd saved carry bit
- mov (up), %eax
- adc %eax, %eax
- mov 4(up), %ebx
- adc %ebx, %ebx
- mov 8(up), %ecx
- adc %ecx, %ecx
- mov 12(up), %ebp
- adc %ebp, %ebp
-
- rcr %edx C restore 1st saved carry bit
-
- sbb %eax, (rp)
- sbb %ebx, 4(rp)
- sbb %ecx, 8(rp)
- sbb %ebp, 12(rp)
-
- mov 16(up), %eax
- adc %eax, %eax
- mov 20(up), %ebx
- adc %ebx, %ebx
- mov 24(up), %ecx
- adc %ecx, %ecx
- mov 28(up), %ebp
- adc %ebp, %ebp
-
- lea 32(up), up
- adc %edx, %edx C save a carry bit in edx
-
- sbb %eax, 16(rp)
- sbb %ebx, 20(rp)
- sbb %ecx, 24(rp)
- sbb %ebp, 28(rp)
-
-ifdef(`CPU_P6',`
- adc %edx, %edx ') C save another carry bit in edx
- incl VAR_COUNT
- lea 32(rp), rp
- jne L(top)
-
- mov SAVE_EBP, %ebp
-L(end):
- mov SAVE_EBX, %ebx
-
-ifdef(`CPU_P6',`
- xor %eax, %eax
- shr $1, %edx
- adc %edx, %eax
-',`
- adc $0, %edx
- mov %edx, %eax
-')
- pop rp FRAME_popl()
- pop up FRAME_popl()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/k8/gmp-mparam.h b/gmp/mpn/x86/k8/gmp-mparam.h
deleted file mode 100644
index 8d95fef80b..0000000000
--- a/gmp/mpn/x86/k8/gmp-mparam.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2500 MHz K8 Brisbane */
-/* FFT tuning limit = 10000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 1
-#define DIV_QR_1_NORM_THRESHOLD 2
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 40
-
-#define MUL_TOOM22_THRESHOLD 24
-#define MUL_TOOM33_THRESHOLD 81
-#define MUL_TOOM44_THRESHOLD 130
-#define MUL_TOOM6H_THRESHOLD 303
-#define MUL_TOOM8H_THRESHOLD 430
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 92
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 122
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 46
-#define SQR_TOOM3_THRESHOLD 78
-#define SQR_TOOM4_THRESHOLD 202
-#define SQR_TOOM6_THRESHOLD 286
-#define SQR_TOOM8_THRESHOLD 422
-
-#define MULMID_TOOM42_THRESHOLD 56
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 18
-
-#define MUL_FFT_MODF_THRESHOLD 848 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 848, 5}, { 27, 6}, { 25, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 167,10}, { 95, 9}, { 199,10}, { 111,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
- { 287,10}, { 159,11}, { 95,10}, { 207,12}, \
- { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \
- { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 335, 9}, { 671,11}, { 191,10}, { 383, 9}, \
- { 799,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543,11}, { 287,10}, { 607, 9}, { 1215,11}, \
- { 319,10}, { 671, 9}, { 1343,12}, { 191,11}, \
- { 383,10}, { 799, 9}, { 1599,11}, { 415,10}, \
- { 863, 9}, { 1727,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1119,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
- { 1471, 9}, { 2943,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \
- { 991,13}, { 255,12}, { 511,11}, { 1023,10}, \
- { 2111,11}, { 1119,12}, { 575,11}, { 1215,10}, \
- { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \
- { 1919,14}, { 255,13}, { 511,12}, { 1023,11}, \
- { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1727,13}, { 895,12}, { 1983,14}, \
- { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 144
-#define MUL_FFT_THRESHOLD 7552
-
-#define SQR_FFT_MODF_THRESHOLD 618 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 618, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \
- { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543, 8}, { 1087,10}, { 287,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
- { 671, 8}, { 1343,10}, { 351,11}, { 191,10}, \
- { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
- { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
- { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \
- { 351,12}, { 191,11}, { 383,10}, { 799, 9}, \
- { 1599,11}, { 415,10}, { 863, 9}, { 1727,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
- { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
- { 799,10}, { 1599,11}, { 863,10}, { 1727,12}, \
- { 447,11}, { 959,10}, { 1919,11}, { 991,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
- { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \
- { 1919,14}, { 255,13}, { 511,12}, { 1087,11}, \
- { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \
- { 1471,11}, { 2943,13}, { 767,12}, { 1727,11}, \
- { 3455,13}, { 895,12}, { 1983,14}, { 511,13}, \
- { 1023,12}, { 2239,13}, { 1151,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 147
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 8
-#define MULLO_DC_THRESHOLD 31
-#define MULLO_MUL_N_THRESHOLD 14281
-
-#define DC_DIV_QR_THRESHOLD 91
-#define DC_DIVAPPR_Q_THRESHOLD 280
-#define DC_BDIV_QR_THRESHOLD 87
-#define DC_BDIV_Q_THRESHOLD 222
-
-#define INV_MULMOD_BNM1_THRESHOLD 62
-#define INV_NEWTON_THRESHOLD 268
-#define INV_APPR_THRESHOLD 270
-
-#define BINV_NEWTON_THRESHOLD 260
-#define REDC_1_TO_REDC_N_THRESHOLD 79
-
-#define MU_DIV_QR_THRESHOLD 1718
-#define MU_DIVAPPR_Q_THRESHOLD 1528
-#define MUPI_DIV_QR_THRESHOLD 97
-#define MU_BDIV_QR_THRESHOLD 1470
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 1,22,114,416,1464
-
-#define MATRIX22_STRASSEN_THRESHOLD 16
-#define HGCD_THRESHOLD 149
-#define HGCD_APPR_THRESHOLD 204
-#define HGCD_REDUCE_THRESHOLD 4455
-#define GCD_DC_THRESHOLD 599
-#define GCDEXT_DC_THRESHOLD 403
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 270
-#define SET_STR_PRECOMPUTE_THRESHOLD 1367
-
-#define FAC_DSC_THRESHOLD 348
-#define FAC_ODD_THRESHOLD 24
diff --git a/gmp/mpn/x86/lshift.asm b/gmp/mpn/x86/lshift.asm
index 6ee6153cc2..5598599f8b 100644
--- a/gmp/mpn/x86/lshift.asm
+++ b/gmp/mpn/x86/lshift.asm
@@ -1,43 +1,33 @@
dnl x86 mpn_lshift -- mpn left shift.
-dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C P54 7.5
-C P55 7.0
-C P6 2.5
-C K6 4.5
-C K7 5.0
-C P4 14.5
+C P54: 7.5
+C P55: 7.0
+C P6: 2.5
+C K6: 4.5
+C K7: 5.0
+C P4: 14.5
C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
diff --git a/gmp/mpn/x86/mmx/sec_tabselect.asm b/gmp/mpn/x86/mmx/sec_tabselect.asm
deleted file mode 100644
index aae158abf7..0000000000
--- a/gmp/mpn/x86/mmx/sec_tabselect.asm
+++ /dev/null
@@ -1,163 +0,0 @@
-dnl X86 MMX mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb
-C ali,evn n unal,evn n
-C P5
-C P6 model 0-8,10-12
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan) 1.33 1.87
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood) 2.1 2.63
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona) 1.7 2.57
-C Intel Atom 1.85 2.7
-C AMD K6
-C AMD K7 1.33 1.33
-C AMD K8
-C AMD K10
-
-define(`rp', `%edi')
-define(`tp', `%esi')
-define(`n', `%edx')
-define(`nents', `%ecx')
-define(`which', `')
-
-define(`i', `%ebp')
-define(`j', `%ebx')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sec_tabselect)
- push %ebx
- push %esi
- push %edi
- push %ebp
-
- mov 20(%esp), rp
- mov 24(%esp), tp
- mov 28(%esp), n
- mov 32(%esp), nents
-
- movd 36(%esp), %mm6
- punpckldq %mm6, %mm6 C 2 copies of `which'
-
- mov $1, %ebx
- movd %ebx, %mm7
- punpckldq %mm7, %mm7 C 2 copies of 1
-
- mov n, j
- add $-4, j
- js L(outer_end)
-
-L(outer_top):
- mov nents, i
- mov tp, %eax
- pxor %mm1, %mm1
- pxor %mm4, %mm4
- pxor %mm5, %mm5
- ALIGN(16)
-L(top): movq %mm6, %mm0
- pcmpeqd %mm1, %mm0
- paddd %mm7, %mm1
- movq (tp), %mm2
- movq 8(tp), %mm3
- pand %mm0, %mm2
- pand %mm0, %mm3
- por %mm2, %mm4
- por %mm3, %mm5
- lea (tp,n,4), tp
- add $-1, i
- jne L(top)
-
- movq %mm4, (rp)
- movq %mm5, 8(rp)
-
- lea 16(%eax), tp
- lea 16(rp), rp
- add $-4, j
- jns L(outer_top)
-L(outer_end):
-
- test $2, %dl
- jz L(b0x)
-
-L(b1x): mov nents, i
- mov tp, %eax
- pxor %mm1, %mm1
- pxor %mm4, %mm4
- ALIGN(16)
-L(tp2): movq %mm6, %mm0
- pcmpeqd %mm1, %mm0
- paddd %mm7, %mm1
- movq (tp), %mm2
- pand %mm0, %mm2
- por %mm2, %mm4
- lea (tp,n,4), tp
- add $-1, i
- jne L(tp2)
-
- movq %mm4, (rp)
-
- lea 8(%eax), tp
- lea 8(rp), rp
-
-L(b0x): test $1, %dl
- jz L(b00)
-
-L(b01): mov nents, i
- pxor %mm1, %mm1
- pxor %mm4, %mm4
- ALIGN(16)
-L(tp1): movq %mm6, %mm0
- pcmpeqd %mm1, %mm0
- paddd %mm7, %mm1
- movd (tp), %mm2
- pand %mm0, %mm2
- por %mm2, %mm4
- lea (tp,n,4), tp
- add $-1, i
- jne L(tp1)
-
- movd %mm4, (rp)
-
-L(b00): pop %ebp
- pop %edi
- pop %esi
- pop %ebx
- emms
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/mod_1.asm b/gmp/mpn/x86/mod_1.asm
new file mode 100644
index 0000000000..0fa3ce0def
--- /dev/null
+++ b/gmp/mpn/x86/mod_1.asm
@@ -0,0 +1,163 @@
+dnl x86 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C 486 42 approx, maybe
+C P5 44
+C P6 39
+C K6 20
+C K7 41
+C P4 58
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C Essentially this code is the same as the division based part of
+C mpn/generic/mod_1.c, but has the advantage that we get the desired divl
+C instruction even when gcc is not being used (where longlong.h only has the
+C rather slow generic C udiv_qrnnd()).
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done. A div is between 20 and 40 cycles on the various
+C x86s, so assuming high<divisor about half the time, then this test saves
+C half that amount. The branch misprediction penalty on each chip is less
+C than half a div.
+C
+C
+C Notes for K6:
+C
+C Back-to-back div instructions take 20 cycles, the same as the loop here,
+C so it seems there's nothing to gain by rearranging. Pairing the mov and
+C loop instructions was found to gain nothing. Normally we use a loop
+C instruction rather than decl/jnz, but it gains nothing here.
+C
+C A multiply-by-inverse is used in mpn/x86/k6/pre_mod_1.asm, but it saves
+C only 2 c/l so currently we haven't bothered with the same for mpn_mod_1.
+C If an inverse takes about 40 cycles for normalized or perhaps 60 for
+C unnormalized (due to bsfl being slow on k6) then the threshold would be at
+C least 20 or 30 limbs.
+C
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+ C Return the remainder of {src,size} divided by divisor; 0 if size==0.
+ movl PARAM_SIZE, %ecx C ecx = size (limb count)
+ pushl %ebx FRAME_pushl() C ebx is restored before ret
+
+ movl PARAM_SRC, %ebx C ebx = src
+ pushl %esi FRAME_pushl() C esi is restored before ret
+
+ orl %ecx, %ecx C ZF set if size==0
+ jz L(done_zero) C size==0: L(done_zero) pops and returns 0
+
+ movl PARAM_DIVISOR, %esi C esi = divisor
+ movl -4(%ebx,%ecx,4), %eax C src high limb
+
+ cmpl %esi, %eax C CF set if high<divisor (unsigned)
+
+ sbbl %edx, %edx C -1 if high<divisor
+
+ addl %edx, %ecx C skip one division if high<divisor
+ jz L(done_eax) C size was 1 and high<divisor: eax (high) is the result
+
+ andl %eax, %edx C carry if high<divisor
+
+
+L(top):
+ C eax scratch (quotient)
+ C ebx src
+ C ecx counter
+ C edx carry (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ movl -4(%ebx,%ecx,4), %eax C next limb, working from high to low
+
+ divl %esi C edx:eax / esi -> quotient eax, remainder edx
+
+ decl %ecx
+ jnz L(top) C loop until all remaining limbs are consumed
+
+
+ movl %edx, %eax C return the final remainder
+L(done_eax):
+ popl %esi
+
+ popl %ebx
+
+ ret
+
+EPILOGUE()
+
+
+ C This code is located after mpn_mod_1, so the jump to L(top) here is
+ C backward and hence will be predicted as taken. (size==0 is considered
+ C unlikely.)
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ C As mpn_mod_1 but starting from a carry-in; returns the carry if size==0.
+ movl PARAM_SIZE, %ecx C ecx = size (limb count)
+ pushl %ebx FRAME_pushl() C ebx is restored before ret
+
+ movl PARAM_SRC, %ebx C ebx = src
+ pushl %esi FRAME_pushl() C esi is restored before ret
+
+ movl PARAM_DIVISOR, %esi C esi = divisor
+ orl %ecx, %ecx C ZF set if size==0 (the movl below leaves flags alone)
+
+ movl PARAM_CARRY, %edx C edx = carry-in; presumably carry<divisor, else divl overflows (confirm)
+ jnz L(top) C size!=0: enter the division loop in mpn_mod_1
+
+ popl %esi
+ movl %edx, %eax C size==0: return the carry unchanged
+
+ popl %ebx
+
+ ret
+
+
+ C This code is for mpn_mod_1, but is positioned here to save some
+ C space in the alignment padding.
+ C
+L(done_zero):
+ popl %esi
+ xorl %eax, %eax C return 0 for size==0
+
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/mod_34lsub1.asm b/gmp/mpn/x86/mod_34lsub1.asm
index e09e702c6f..68b4a73dbc 100644
--- a/gmp/mpn/x86/mod_34lsub1.asm
+++ b/gmp/mpn/x86/mod_34lsub1.asm
@@ -1,42 +1,31 @@
dnl Generic x86 mpn_mod_34lsub1 -- mpn remainder modulo 2^24-1.
-dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C P5 3.0
-C P6 3.66
-C K6 3.0
-C K7 1.3
-C P4 9
+C P5: 3.0
+C P6: 3.66
+C K6: 3.0
+C K7: 1.3
+C P4: 9
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
diff --git a/gmp/mpn/x86/mul_1.asm b/gmp/mpn/x86/mul_1.asm
index 421de62225..1d715ece7e 100644
--- a/gmp/mpn/x86/mul_1.asm
+++ b/gmp/mpn/x86/mul_1.asm
@@ -1,50 +1,40 @@
dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
dnl with a limb and store the result in a second limb vector.
-dnl Copyright 1992, 1994, 1997-2002, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1997, 1998, 1999, 2000, 2001, 2002, 2005 Free
+dnl Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5 12.5
-C P6 model 0-8,10-12 5.5
+C cycles/limb
+C P5: 12.5
+C P6 model 0-8,10-12 5.5
C P6 model 9 (Banias)
-C P6 model 13 (Dothan) 5.25
-C P4 model 0 (Willamette) 19.0
-C P4 model 1 (?) 19.0
-C P4 model 2 (Northwood) 19.0
+C P6 model 13 (Dothan) 5.25
+C P4 model 0 (Willamette) 19.0
+C P4 model 1 (?) 19.0
+C P4 model 2 (Northwood) 19.0
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6 10.5
-C AMD K7 4.5
-C AMD K8
+C K6: 10.5
+C K7: 4.5
+C K8:
C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
diff --git a/gmp/mpn/x86/mul_basecase.asm b/gmp/mpn/x86/mul_basecase.asm
index 8339732a80..7918ea07f3 100644
--- a/gmp/mpn/x86/mul_basecase.asm
+++ b/gmp/mpn/x86/mul_basecase.asm
@@ -1,43 +1,33 @@
dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
dnl in a third limb vector.
-dnl Copyright 1996-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/crossproduct
-C P5 15
-C P6 7.5
-C K6 12.5
-C K7 5.5
-C P4 24
+C P5: 15
+C P6: 7.5
+C K6: 12.5
+C K7: 5.5
+C P4: 24
C void mpn_mul_basecase (mp_ptr wp,
diff --git a/gmp/mpn/x86/nano/gmp-mparam.h b/gmp/mpn/x86/nano/gmp-mparam.h
deleted file mode 100644
index cd8ac4e1d6..0000000000
--- a/gmp/mpn/x86/nano/gmp-mparam.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
-
-#define MOD_1_1P_METHOD 1
-#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 53
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
-#define USE_PREINV_DIVREM_1 1
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 32
-
-#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 132
-#define MUL_TOOM44_THRESHOLD 195
-#define MUL_TOOM6H_THRESHOLD 270
-#define MUL_TOOM8H_THRESHOLD 478
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 130
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 135
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 194
-#define SQR_TOOM4_THRESHOLD 502
-#define SQR_TOOM6_THRESHOLD 746
-#define SQR_TOOM8_THRESHOLD 1005
-
-#define MULMID_TOOM42_THRESHOLD 40
-
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 19
-
-#define POWM_SEC_TABLE 4,23,258,828,2246
-
-#define MUL_FFT_MODF_THRESHOLD 308 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 308, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \
- { 9, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
- { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 11, 6}, { 24, 7}, { 15, 6}, \
- { 31, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 9}, { 15, 8}, { 31, 7}, \
- { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 63, 9}, { 47,10}, \
- { 31, 9}, { 71,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
- { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 543, 9}, \
- { 287, 8}, { 575, 7}, { 1215,10}, { 159,11}, \
- { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \
- { 607, 8}, { 1215,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 351, 9}, { 703, 8}, { 1407, 9}, \
- { 735, 8}, { 1471,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 447, 9}, { 895,10}, { 479, 9}, { 959, 8}, \
- { 1919,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
- { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 89
-#define MUL_FFT_THRESHOLD 1856
-
-#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 396, 5}, { 13, 6}, { 7, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \
- { 39, 7}, { 21, 8}, { 11, 7}, { 23, 6}, \
- { 47, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
- { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
- { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 543,10}, { 143, 9}, \
- { 287, 8}, { 607, 7}, { 1215, 6}, { 2431,10}, \
- { 159, 8}, { 639,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
- { 1087,10}, { 287, 9}, { 607, 8}, { 1215,11}, \
- { 159,10}, { 319, 9}, { 671,10}, { 351, 9}, \
- { 703, 8}, { 1407, 9}, { 735, 8}, { 1471, 7}, \
- { 2943,11}, { 191,10}, { 383, 9}, { 799,10}, \
- { 415, 9}, { 895,10}, { 479,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 87
-#define SQR_FFT_THRESHOLD 2368
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 51
-#define MULLO_MUL_N_THRESHOLD 3369
-
-#define DC_DIV_QR_THRESHOLD 56
-#define DC_DIVAPPR_Q_THRESHOLD 183
-#define DC_BDIV_QR_THRESHOLD 55
-#define DC_BDIV_Q_THRESHOLD 118
-
-#define INV_MULMOD_BNM1_THRESHOLD 30
-#define INV_NEWTON_THRESHOLD 266
-#define INV_APPR_THRESHOLD 218
-
-#define BINV_NEWTON_THRESHOLD 268
-#define REDC_1_TO_REDC_N_THRESHOLD 56
-
-#define MU_DIV_QR_THRESHOLD 1308
-#define MU_DIVAPPR_Q_THRESHOLD 1528
-#define MUPI_DIV_QR_THRESHOLD 124
-#define MU_BDIV_QR_THRESHOLD 855
-#define MU_BDIV_Q_THRESHOLD 1334
-
-#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 104
-#define HGCD_APPR_THRESHOLD 139
-#define HGCD_REDUCE_THRESHOLD 2121
-#define GCD_DC_THRESHOLD 456
-#define GCDEXT_DC_THRESHOLD 321
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 11
-#define GET_STR_PRECOMPUTE_THRESHOLD 25
-#define SET_STR_DC_THRESHOLD 542
-#define SET_STR_PRECOMPUTE_THRESHOLD 840
diff --git a/gmp/mpn/x86/p6/README b/gmp/mpn/x86/p6/README
index f19d47b94f..1ded4e7177 100644
--- a/gmp/mpn/x86/p6/README
+++ b/gmp/mpn/x86/p6/README
@@ -3,28 +3,17 @@ Copyright 2000, 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
@@ -63,7 +52,7 @@ Some of these might be able to be improved.
mpn_mul_basecase 8.2 cycles/crossproduct (approx)
mpn_sqr_basecase 4.0 cycles/crossproduct (approx)
- or 7.75 cycles/triangleproduct (approx)
+ or 7.75 cycles/triangleproduct (approx)
Pentium II and III have MMX and get the following improvements.
diff --git a/gmp/mpn/x86/p6/aors_n.asm b/gmp/mpn/x86/p6/aors_n.asm
index df51c2e6f7..f4652ec2cb 100644
--- a/gmp/mpn/x86/p6/aors_n.asm
+++ b/gmp/mpn/x86/p6/aors_n.asm
@@ -1,43 +1,32 @@
dnl Intel P6 mpn_add_n/mpn_sub_n -- mpn add or subtract.
dnl Copyright 2006 Free Software Foundation, Inc.
-
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C TODO:
-C * Avoid indexed addressing, it makes us stall on the two-ported register
+C * Avoid indexed addressing, it makes us stall on the two-ported register
C file.
-C cycles/limb
-C P6 model 0-8,10-12 3.17
-C P6 model 9 (Banias) 2.15
-C P6 model 13 (Dothan) 2.25
+C cycles/limb
+C P6 model 0-8,10-12 3.17
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 2.25
define(`rp', `%edi')
diff --git a/gmp/mpn/x86/p6/aorsmul_1.asm b/gmp/mpn/x86/p6/aorsmul_1.asm
index bc8c49c62e..746bf05f12 100644
--- a/gmp/mpn/x86/p6/aorsmul_1.asm
+++ b/gmp/mpn/x86/p6/aorsmul_1.asm
@@ -1,49 +1,38 @@
dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl Copyright 1999-2002, 2005 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P5
-C P6 model 0-8,10-12 6.44
-C P6 model 9 (Banias) 6.15
-C P6 model 13 (Dothan) 6.11
+C cycles/limb
+C P5:
+C P6 model 0-8,10-12 6.44
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 6.11
C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood)
C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
-C AMD K6
-C AMD K7
-C AMD K8
+C K6:
+C K7:
+C K8:
dnl P6 UNROLL_COUNT cycles/limb
@@ -181,7 +170,7 @@ C registers when doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %ebx is necessary only for the
C mpn_add/submul_1c entry points. Duplicating the startup code to
-C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
C idea.
dnl overlapping with parameters already fetched
diff --git a/gmp/mpn/x86/p6/bdiv_q_1.asm b/gmp/mpn/x86/p6/bdiv_q_1.asm
deleted file mode 100644
index 2cc179c238..0000000000
--- a/gmp/mpn/x86/p6/bdiv_q_1.asm
+++ /dev/null
@@ -1,286 +0,0 @@
-dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
-
-dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
-
-dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C odd even divisor
-C P6: 10.0 12.0 cycles/limb
-
-C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
-
-C The odd case is basically the same as mpn_modexact_1_odd, just with an
-C extra store, and it runs at the same 10 cycles which is the dependent
-C chain.
-C
-C The shifts for the even case aren't on the dependent chain so in principle
-C it could run the same too, but nothing running at 10 has been found.
-C Perhaps there's too many uops (an extra 4 over the odd case).
-
-defframe(PARAM_SHIFT, 24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-deflit(STACK_SPACE, 16)
-
-dnl re-use parameter space
-define(VAR_INVERSE,`PARAM_SRC')
-
- TEXT
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t inverse, int shift)
-
- ALIGN(16)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
- subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %ebx, SAVE_EBX
- movl PARAM_SIZE, %ebx
-
- movl %ebp, SAVE_EBP
- movl PARAM_INVERSE, %ebp
-
- movl PARAM_SHIFT, %ecx C trailing twos
-
-L(common):
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- leal (%esi,%ebx,4), %esi C src end
-
- leal (%edi,%ebx,4), %edi C dst end
- negl %ebx C -size
-
- movl (%esi,%ebx,4), %eax C src[0]
-
- orl %ecx, %ecx
- jz L(odd_entry)
-
- movl %edi, PARAM_DST
- movl %ebp, VAR_INVERSE
-
-L(even):
- C eax src[0]
- C ebx counter, limbs, negative
- C ecx shift
- C edx
- C esi
- C edi
- C ebp
-
- xorl %ebp, %ebp C initial carry bit
- xorl %edx, %edx C initial carry limb (for size==1)
-
- incl %ebx
- jz L(even_one)
-
- movl (%esi,%ebx,4), %edi C src[1]
-
- shrdl( %cl, %edi, %eax)
-
- jmp L(even_entry)
-
-
-L(even_top):
- C eax scratch
- C ebx counter, limbs, negative
- C ecx shift
- C edx scratch
- C esi &src[size]
- C edi &dst[size] and scratch
- C ebp carry bit
-
- movl (%esi,%ebx,4), %edi
-
- mull PARAM_DIVISOR
-
- movl -4(%esi,%ebx,4), %eax
- shrdl( %cl, %edi, %eax)
-
- subl %ebp, %eax
-
- sbbl %ebp, %ebp
- subl %edx, %eax
-
- sbbl $0, %ebp
-
-L(even_entry):
- imull VAR_INVERSE, %eax
-
- movl PARAM_DST, %edi
- negl %ebp
-
- movl %eax, -4(%edi,%ebx,4)
- incl %ebx
- jnz L(even_top)
-
- mull PARAM_DIVISOR
-
- movl -4(%esi), %eax
-
-L(even_one):
- shrl %cl, %eax
- movl SAVE_ESI, %esi
-
- subl %ebp, %eax
- movl SAVE_EBP, %ebp
-
- subl %edx, %eax
- movl SAVE_EBX, %ebx
-
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi)
- movl SAVE_EDI, %edi
- addl $STACK_SPACE, %esp
-
- ret
-
-C The dependent chain here is
-C
-C subl %edx, %eax 1
-C imull %ebp, %eax 4
-C mull PARAM_DIVISOR 5
-C ----
-C total 10
-C
-C and this is the measured speed. No special scheduling is necessary, out
-C of order execution hides the load latency.
-
-L(odd_top):
- C eax scratch (src limb)
- C ebx counter, limbs, negative
- C ecx carry bit
- C edx carry limb, high of last product
- C esi &src[size]
- C edi &dst[size]
- C ebp inverse
-
- mull PARAM_DIVISOR
-
- movl (%esi,%ebx,4), %eax
- subl %ecx, %eax
-
- sbbl %ecx, %ecx
- subl %edx, %eax
-
- sbbl $0, %ecx
-
-L(odd_entry):
- imull %ebp, %eax
-
- movl %eax, (%edi,%ebx,4)
- negl %ecx
-
- incl %ebx
- jnz L(odd_top)
-
-
- movl SAVE_ESI, %esi
-
- movl SAVE_EDI, %edi
-
- movl SAVE_EBP, %ebp
-
- movl SAVE_EBX, %ebx
- addl $STACK_SPACE, %esp
-
- ret
-
-EPILOGUE()
-
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-
- ALIGN(16)
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_DIVISOR, %eax
- subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %ebx, SAVE_EBX
- movl PARAM_SIZE, %ebx
-
- bsfl %eax, %ecx C trailing twos
-
- movl %ebp, SAVE_EBP
-
- shrl %cl, %eax C d without twos
-
- movl %eax, %edx
- shrl %eax C d/2 without twos
-
- movl %edx, PARAM_DIVISOR
- andl $127, %eax
-
-ifdef(`PIC',`
- LEA( binvert_limb_table, %ebp)
- movzbl (%eax,%ebp), %ebp C inv 8 bits
-',`
- movzbl binvert_limb_table(%eax), %ebp C inv 8 bits
-')
-
- leal (%ebp,%ebp), %eax C 2*inv
-
- imull %ebp, %ebp C inv*inv
- imull %edx, %ebp C inv*inv*d
-
- subl %ebp, %eax C inv = 2*inv - inv*inv*d
- leal (%eax,%eax), %ebp C 2*inv
-
- imull %eax, %eax C inv*inv
- imull %edx, %eax C inv*inv*d
-
- subl %eax, %ebp C inv = 2*inv - inv*inv*d
-
- jmp L(common)
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/p6/copyd.asm b/gmp/mpn/x86/p6/copyd.asm
index 1be7636835..2946f51e7a 100644
--- a/gmp/mpn/x86/p6/copyd.asm
+++ b/gmp/mpn/x86/p6/copyd.asm
@@ -1,32 +1,21 @@
dnl Intel P6 mpn_copyd -- copy limb vector backwards.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/dive_1.asm b/gmp/mpn/x86/p6/dive_1.asm
index aa7ba880c9..e8efc28eac 100644
--- a/gmp/mpn/x86/p6/dive_1.asm
+++ b/gmp/mpn/x86/p6/dive_1.asm
@@ -1,32 +1,21 @@
dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -117,7 +106,7 @@ ifdef(`PIC',`
subl %eax, %ebp C inv = 2*inv - inv*inv*d
- ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
movl PARAM_DIVISOR, %eax
imull %ebp, %eax
cmpl $1, %eax')
@@ -138,7 +127,7 @@ C subl %edx, %eax 1
C imull %ebp, %eax 4
C mull PARAM_DIVISOR 5
C ----
-C total 10
+C total 10
C
C and this is the measured speed. No special scheduling is necessary, out
C of order execution hides the load latency.
diff --git a/gmp/mpn/x86/p6/gcd_1.asm b/gmp/mpn/x86/p6/gcd_1.asm
deleted file mode 100644
index f6518f6e19..0000000000
--- a/gmp/mpn/x86/p6/gcd_1.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-dnl x86 mpn_gcd_1 optimised for processors with fast BSF.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K7 7.80
-C AMD K8,K9 7.79
-C AMD K10 4.08
-C AMD bd1 ?
-C AMD bobcat 7.82
-C Intel P4-2 14.9
-C Intel P4-3/4 14.0
-C Intel P6/13 5.09
-C Intel core2 4.22
-C Intel NHM 5.00
-C Intel SBR 5.00
-C Intel atom 17.1
-C VIA nano ?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-
-define(`up', `%edi')
-define(`n', `%esi')
-define(`v0', `%edx')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- push %edi
- push %esi
-
- mov 12(%esp), up
- mov 16(%esp), n
- mov 20(%esp), v0
-
- mov (up), %eax C U low limb
- or v0, %eax
- bsf %eax, %eax C min(ctz(u0),ctz(v0))
-
- bsf v0, %ecx
- shr %cl, v0
-
- push %eax C preserve common twos over call
- push v0 C preserve v0 argument over call
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %ecx
- mov %ecx, %eax
- shr $BMOD_THRES_LOG2, %ecx
- cmp %ecx, v0
- ja L(reduced)
- jmp L(bmod)
-
-L(reduce_nby1):
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
-ifdef(`PIC_WITH_EBX',`
- push %ebx
- call L(movl_eip_to_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
-')
- push v0 C param 3
- push n C param 2
- push up C param 1
- CALL( mpn_mod_1)
- jmp L(called)
-
-L(bmod):
-ifdef(`PIC_WITH_EBX',`dnl
- push %ebx
- call L(movl_eip_to_ebx)
- add $_GLOBAL_OFFSET_TABLE_, %ebx
-')
- push v0 C param 3
- push n C param 2
- push up C param 1
- CALL( mpn_modexact_1_odd)
-
-L(called):
- add $12, %esp C deallocate params
-ifdef(`PIC_WITH_EBX',`dnl
- pop %ebx
-')
-L(reduced):
- pop %edx
-
- bsf %eax, %ecx
-C test %eax, %eax C FIXME: does this lower latency?
- jnz L(mid)
- jmp L(end)
-
- ALIGN(16) C K10 BD C2 NHM SBR
-L(top): cmovc( %esi, %eax) C if x-y < 0 0,3 0,3 0,6 0,5 0,5
- cmovc( %edi, %edx) C use x,y-x 0,3 0,3 2,8 1,7 1,7
-L(mid): shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8
- mov %edx, %esi C 1 1 4 3 3
- sub %eax, %esi C 2 2 5 4 4
- bsf %esi, %ecx C 3 3 6 5 5
- mov %eax, %edi C 2 2 3 3 4
- sub %edx, %eax C 2 2 4 3 4
- jnz L(top) C
-
-L(end): pop %ecx
- mov %edx, %eax
- shl %cl, %eax
-
- pop %esi
- pop %edi
- ret
-
-ifdef(`PIC_WITH_EBX',`dnl
-L(movl_eip_to_ebx):
- mov (%esp), %ebx
- ret
-')
-EPILOGUE()
diff --git a/gmp/mpn/x86/p6/gmp-mparam.h b/gmp/mpn/x86/p6/gmp-mparam.h
index 96c96fd558..a85c500275 100644
--- a/gmp/mpn/x86/p6/gmp-mparam.h
+++ b/gmp/mpn/x86/p6/gmp-mparam.h
@@ -1,194 +1,70 @@
/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2003, 2008-2010, 2012 Free Software
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-or both in parallel, as here.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-
-/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
- value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
- mpn/x86/p6/sqr_basecase.asm. */
-
-
-/* 1867 MHz P6 model 13 */
-
-#define MOD_1_NORM_THRESHOLD 4
-#define MOD_1_UNNORM_THRESHOLD 4
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 21
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 74
-#define MUL_TOOM44_THRESHOLD 181
-#define MUL_TOOM6H_THRESHOLD 252
-#define MUL_TOOM8H_THRESHOLD 363
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 101
-#define SQR_TOOM4_THRESHOLD 154
-#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 527
-
-#define MULMID_TOOM42_THRESHOLD 58
-
-#define MULMOD_BNM1_THRESHOLD 13
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define POWM_SEC_TABLE 4,23,258,768,2388
-
-#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
- { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \
- { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \
- { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \
- { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
- { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
- { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
- { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \
- { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
- { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
- { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
- { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
- { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
- { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
- { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
- { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \
- { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 132
-#define MUL_FFT_THRESHOLD 6784
-
-#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \
- { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \
- { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \
- { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
- { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \
- { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
- { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
- { 831,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
- { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
- { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
- { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
- { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
- { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
- { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
- { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
- { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
- { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
- { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 146
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 20
-#define DC_DIVAPPR_Q_THRESHOLD 56
-#define DC_BDIV_QR_THRESHOLD 60
-#define DC_BDIV_Q_THRESHOLD 134
-
-#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 66
-#define INV_APPR_THRESHOLD 63
-
-#define BINV_NEWTON_THRESHOLD 250
-#define REDC_1_TO_REDC_N_THRESHOLD 63
-
-#define MU_DIV_QR_THRESHOLD 1164
-#define MU_DIVAPPR_Q_THRESHOLD 979
-#define MUPI_DIV_QR_THRESHOLD 38
-#define MU_BDIV_QR_THRESHOLD 1442
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 64
-#define HGCD_APPR_THRESHOLD 105
-#define HGCD_REDUCE_THRESHOLD 3524
-#define GCD_DC_THRESHOLD 386
-#define GCDEXT_DC_THRESHOLD 309
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 587
-#define SET_STR_PRECOMPUTE_THRESHOLD 1104
+
+/* NOTE: In a fat binary build SQR_KARATSUBA_THRESHOLD here cannot be
+ smaller than the value in mpn/x86/p6/mmx/gmp-mparam.h. The former is
+ used as a hard limit in mpn/x86/p6/sqr_basecase.asm, and that file will
+ be run by the p6/mmx cpus (pentium2, pentium3). */
+
+
+/* 200MHz Pentium Pro */
+
+/* Generated by tuneup.c, 2003-02-12, gcc 2.95 */
+
+#define MUL_KARATSUBA_THRESHOLD 23
+#define MUL_TOOM3_THRESHOLD 140
+
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_KARATSUBA_THRESHOLD 52
+#define SQR_TOOM3_THRESHOLD 189
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 116
+#define POWM_THRESHOLD 131
+
+#define GCD_ACCEL_THRESHOLD 3
+#define JACOBI_BASE_METHOD 1
+
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVREM_2_THRESHOLD 0 /* always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_THRESHOLD 6093
+
+#define MUL_FFT_TABLE { 464, 928, 1920, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 360
+#define MUL_FFT_THRESHOLD 2816
+
+#define SQR_FFT_TABLE { 528, 1184, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 440
+#define SQR_FFT_THRESHOLD 2816
+
+#define MUL_FFT_TABLE2 {{1,4}, {305,5}, {321,4}, {337,5}, {353,4}, {369,5}, {801,6}, {833,5}, {865,6}, {897,5}, {929,6}, {961,5}, {993,6}, {1345,7}, {1409,6}, {1537,7}, {1665,6}, {1729,7}, {2689,8}, {2817,7}, {3201,8}, {3329,7}, {3457,8}, {3841,7}, {3969,8}, {4097,7}, {4225,8}, {4353,7}, {4481,8}, {5889,7}, {6017,8}, {6401,7}, {6529,8}, {6913,9}, {7681,8}, {8961,9}, {9729,8}, {9985,9}, {10241,8}, {11009,9}, {11777,8}, {12289,9}, {13825,10}, {15361,9}, {15873,8}, {16129,9}, {19969,10}, {23553,9}, {24065,8}, {24321,9}, {26113,10}, {27649,11}, {28673,10}, {31745,9}, {34305,10}, {34817,9}, {35329,10}, {39937,9}, {40449,10}, {48129,11}, {55297,10}, {56321,11}, {63489,10}, {80897,11}, {96257,10}, {97281,12}, {126977,11}, {129025,10}, {130049,9}, {130561,10}, {131073,11}, {133121,10}, {134145,11}, {137217,10}, {138241,11}, {161793,10}, {162817,11}, {194561,12}, {258049,11}, {260097,10}, {261121,9}, {261633,10}, {266241,11}, {268289,10}, {277505,11}, {292865,10}, {293889,9}, {294401,10}, {310273,9}, {310785,11}, {325633,10}, {326657,12}, {389121,13}, {516097,12}, {520193,11}, {522241,10}, {523265,11}, {555009,10}, {556033,11}, {587777,10}, {588801,11}, {620545,10}, {621569,9}, {622081,11}, {622593,12}, {651265,11}, {653313,10}, {654337,11}, {655361,10}, {657409,11}, {663553,10}, {664577,11}, {686081,10}, {687105,11}, {718849,10}, {719873,11}, {720897,10}, {722945,11}, {737281,10}, {740353,11}, {745473,10}, {749569,11}, {751617,10}, {752641,9}, {753153,11}, {753665,12}, {770049,11}, {774145,12}, {782337,11}, {786433,10}, {787457,11}, {817153,10}, {818177,11}, {849921,10}, {850945,11}, {854017,10}, {855041,11}, {862209,10}, {863233,11}, {866305,10}, {867329,11}, {876545,10}, {877569,11}, {882689,10}, {883713,9}, {884225,11}, {884737,13}, {1040385,12}, {1044481,11}, {1112065,10}, {1113089,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,10}, {1375233,12}, {1437697,11}, {1505281,10}, {1506305,12}, {1515521,13}, {1523713,12}, {1527809,13}, {1540097,12}, {1544193,13}, 
{1548289,12}, {1568769,11}, {1636353,10}, {1637377,12}, {1699841,11}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {273,5}, {289,4}, {305,5}, {673,6}, {705,5}, {737,6}, {769,5}, {801,6}, {1345,7}, {1409,6}, {1537,7}, {1665,6}, {1729,7}, {2689,8}, {2817,7}, {3201,8}, {3329,7}, {3713,8}, {3841,7}, {4225,8}, {4865,7}, {4993,9}, {5121,8}, {6657,9}, {7681,8}, {8961,9}, {11777,8}, {12033,10}, {12289,8}, {12545,9}, {13825,10}, {14337,9}, {14849,10}, {15361,9}, {19969,10}, {23553,9}, {24577,11}, {30721,10}, {31745,9}, {32257,10}, {37889,9}, {38401,10}, {39937,9}, {40449,10}, {48129,11}, {63489,10}, {80897,11}, {96257,12}, {126977,11}, {129025,10}, {130049,11}, {194561,12}, {208897,11}, {210945,12}, {258049,11}, {260097,9}, {269313,10}, {277505,9}, {278017,11}, {278529,10}, {280577,11}, {282625,10}, {283649,11}, {284673,10}, {285697,11}, {286721,10}, {289793,11}, {290817,10}, {293889,9}, {294401,10}, {310273,9}, {310785,8}, {311041,10}, {311297,11}, {315393,10}, {321537,12}, {323585,11}, {325633,10}, {326657,12}, {331777,10}, {332801,12}, {389121,10}, {392193,9}, {392705,10}, {413697,9}, {414209,10}, {418817,9}, {419841,10}, {424961,9}, {425473,10}, {441345,9}, {441857,10}, {449537,9}, {450561,10}, {452609,9}, {453121,10}, {454657,9}, {455169,10}, {490497,12}, {491521,13}, {516097,12}, {520193,10}, {523265,11}, {555009,10}, {556033,11}, {587777,10}, {588801,11}, {620545,10}, {621569,9}, {622081,11}, {624641,12}, {626689,11}, {653313,10}, {654337,11}, {686081,10}, {687105,11}, {718849,10}, {720897,11}, {722945,10}, {724993,11}, {729089,10}, {734209,11}, {737281,10}, {744449,11}, {745473,10}, {747521,11}, {749569,10}, {752641,11}, {784385,10}, {785409,11}, {808961,10}, {809985,11}, {817153,10}, {818177,11}, {849921,10}, {850945,11}, {851969,10}, {852993,11}, {858113,10}, {859137,11}, {860161,10}, {861185,11}, {882689,10}, {883713,11}, {980993,13}, {1040385,11}, {1112065,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,10}, {1375233,12}, {1437697,11}, {1505281,10}, {1506305,12}, {1568769,11}, {1636353,10}, {1637377,12}, {MP_SIZE_T_MAX,0}}
diff --git a/gmp/mpn/x86/p6/lshsub_n.asm b/gmp/mpn/x86/p6/lshsub_n.asm
index 7ada213644..a3086bdbc2 100644
--- a/gmp/mpn/x86/p6/lshsub_n.asm
+++ b/gmp/mpn/x86/p6/lshsub_n.asm
@@ -1,38 +1,27 @@
dnl Intel P6 mpn_lshsub_n -- mpn papillion support.
dnl Copyright 2006 Free Software Foundation, Inc.
-
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12)
-C (1) The loop is not scheduled in any way, and scheduling attempts have not
+C (1) The loop is is not scheduled in any way, and scheduling attempts have not
C improved speed on P6/13. Presumably, the K7 will want scheduling, if it
C at all wants to use MMX.
C (2) We could save a register by not alternatingly using eax and edx in the
diff --git a/gmp/mpn/x86/p6/mmx/divrem_1.asm b/gmp/mpn/x86/p6/mmx/divrem_1.asm
index 5300616c14..8891f3a843 100644
--- a/gmp/mpn/x86/p6/mmx/divrem_1.asm
+++ b/gmp/mpn/x86/p6/mmx/divrem_1.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/mmx/gmp-mparam.h b/gmp/mpn/x86/p6/mmx/gmp-mparam.h
index 35c3aadfc1..47602f562e 100644
--- a/gmp/mpn/x86/p6/mmx/gmp-mparam.h
+++ b/gmp/mpn/x86/p6/mmx/gmp-mparam.h
@@ -1,198 +1,79 @@
/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2005, 2009, 2010 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-or both in parallel, as here.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-
-/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
- value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
- mpn/x86/p6/sqr_basecase.asm. */
-
-
-/* 800 MHz P6 model 8 */
-
-#define MOD_1_NORM_THRESHOLD 4
-#define MOD_1_UNNORM_THRESHOLD 4
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 49
-
-#define MUL_TOOM22_THRESHOLD 22
-#define MUL_TOOM33_THRESHOLD 73
-#define MUL_TOOM44_THRESHOLD 193
-#define MUL_TOOM6H_THRESHOLD 254
-#define MUL_TOOM8H_THRESHOLD 381
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 81
-#define SQR_TOOM4_THRESHOLD 142
-#define SQR_TOOM6_THRESHOLD 258
-#define SQR_TOOM8_THRESHOLD 399
-
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 18
-
-#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
- { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
- { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 167,10}, { 95, 9}, { 199,10}, \
- { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 143, 9}, { 287, 8}, { 575,10}, \
- { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
- { 639,10}, { 351, 9}, { 703,11}, { 191,10}, \
- { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \
- { 223,10}, { 447,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
- { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \
- { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,10}, { 831,11}, { 447,13}, { 127,12}, \
- { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
- { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 703,10}, { 1407,11}, { 735,12}, { 383,11}, \
- { 831,12}, { 447,11}, { 959,10}, { 1919,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
- { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
- { 1535,12}, { 831,11}, { 1727,12}, { 959,11}, \
- { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \
- { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
- { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \
- { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \
- { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \
- { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
- { 3327,13}, { 1919,12}, { 3839,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 160
-#define MUL_FFT_THRESHOLD 7040
-
-#define SQR_FFT_MODF_THRESHOLD 376 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
- { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
- { 255, 9}, { 135,10}, { 79, 9}, { 167,10}, \
- { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \
- { 271,10}, { 143, 9}, { 287, 8}, { 575, 9}, \
- { 303, 8}, { 607,10}, { 159, 9}, { 319,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \
- { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
- { 479,12}, { 127,11}, { 255,10}, { 543, 9}, \
- { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \
- { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \
- { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \
- { 447,11}, { 959,10}, { 1919,13}, { 255,12}, \
- { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
- { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \
- { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
- { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \
- { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \
- { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \
- { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \
- { 2943,14}, { 767,13}, { 1535,12}, { 3071,13}, \
- { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \
- { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 161
-#define SQR_FFT_THRESHOLD 3712
-
-#define MULLO_BASECASE_THRESHOLD 8
-#define MULLO_DC_THRESHOLD 60
-#define MULLO_MUL_N_THRESHOLD 13765
-
-#define DC_DIV_QR_THRESHOLD 83
-#define DC_DIVAPPR_Q_THRESHOLD 246
-#define DC_BDIV_QR_THRESHOLD 76
-#define DC_BDIV_Q_THRESHOLD 175
-
-#define INV_MULMOD_BNM1_THRESHOLD 42
-#define INV_NEWTON_THRESHOLD 268
-#define INV_APPR_THRESHOLD 250
-
-#define BINV_NEWTON_THRESHOLD 276
-#define REDC_1_TO_REDC_N_THRESHOLD 74
-
-#define MU_DIV_QR_THRESHOLD 1442
-#define MU_DIVAPPR_Q_THRESHOLD 1442
-#define MUPI_DIV_QR_THRESHOLD 132
-#define MU_BDIV_QR_THRESHOLD 1142
-#define MU_BDIV_Q_THRESHOLD 1334
-
-#define MATRIX22_STRASSEN_THRESHOLD 18
-#define HGCD_THRESHOLD 121
-#define GCD_DC_THRESHOLD 478
-#define GCDEXT_DC_THRESHOLD 361
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 272
-#define SET_STR_PRECOMPUTE_THRESHOLD 1074
+
+/* NOTE: In a fat binary build SQR_KARATSUBA_THRESHOLD here cannot be more
+ than the value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard
+ limit in mpn/x86/p6/sqr_basecase.asm. */
+
+
+/* 1867 MHz P6 model 13 */
+
+/* Generated by tuneup.c, 2009-03-02, gcc 4.3 */
+
+#define MUL_KARATSUBA_THRESHOLD 20
+#define MUL_TOOM3_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 166
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 154
+
+#define MULLOW_BASECASE_THRESHOLD 7
+#define MULLOW_DC_THRESHOLD 39
+#define MULLOW_MUL_N_THRESHOLD 230
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 21
+#define POWM_THRESHOLD 154
+
+#define MATRIX22_STRASSEN_THRESHOLD 23
+#define HGCD_THRESHOLD 72
+#define GCD_DC_THRESHOLD 321
+#define GCDEXT_DC_THRESHOLD 416
+#define JACOBI_BASE_METHOD 1
+
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 587
+#define SET_STR_PRECOMPUTE_THRESHOLD 1083
+
+#define MUL_FFT_TABLE { 400, 928, 1664, 4608, 10240, 57344, 163840, 393216, 0 }
+#define MUL_FFT_MODF_THRESHOLD 496
+#define MUL_FFT_THRESHOLD 7168
+
+#define SQR_FFT_TABLE { 432, 928, 1664, 3584, 10240, 40960, 98304, 393216, 0 }
+#define SQR_FFT_MODF_THRESHOLD 448
+#define SQR_FFT_THRESHOLD 3840
+
+/* These tables need updating */
+#define MUL_FFT_TABLE2 {{1,4}, {305,5}, {321,4}, {337,5}, {353,4}, {369,5}, {801,6}, {833,5}, {865,6}, {897,5}, {929,6}, {961,5}, {993,6}, {1345,7}, {1409,6}, {1537,7}, {1665,6}, {1729,7}, {2689,8}, {2817,7}, {3201,8}, {3329,7}, {3457,8}, {3841,7}, {3969,8}, {4097,7}, {4225,8}, {4353,7}, {4481,8}, {5889,7}, {6017,8}, {6401,7}, {6529,8}, {6913,9}, {7681,8}, {8961,9}, {9729,8}, {9985,9}, {10241,8}, {11009,9}, {11777,8}, {12289,9}, {13825,10}, {15361,9}, {15873,8}, {16129,9}, {19969,10}, {23553,9}, {24065,8}, {24321,9}, {26113,10}, {27649,11}, {28673,10}, {31745,9}, {34305,10}, {34817,9}, {35329,10}, {39937,9}, {40449,10}, {48129,11}, {55297,10}, {56321,11}, {63489,10}, {80897,11}, {96257,10}, {97281,12}, {126977,11}, {129025,10}, {130049,9}, {130561,10}, {131073,11}, {133121,10}, {134145,11}, {137217,10}, {138241,11}, {161793,10}, {162817,11}, {194561,12}, {258049,11}, {260097,10}, {261121,9}, {261633,10}, {266241,11}, {268289,10}, {277505,11}, {292865,10}, {293889,9}, {294401,10}, {310273,9}, {310785,11}, {325633,10}, {326657,12}, {389121,13}, {516097,12}, {520193,11}, {522241,10}, {523265,11}, {555009,10}, {556033,11}, {587777,10}, {588801,11}, {620545,10}, {621569,9}, {622081,11}, {622593,12}, {651265,11}, {653313,10}, {654337,11}, {655361,10}, {657409,11}, {663553,10}, {664577,11}, {686081,10}, {687105,11}, {718849,10}, {719873,11}, {720897,10}, {722945,11}, {737281,10}, {740353,11}, {745473,10}, {749569,11}, {751617,10}, {752641,9}, {753153,11}, {753665,12}, {770049,11}, {774145,12}, {782337,11}, {786433,10}, {787457,11}, {817153,10}, {818177,11}, {849921,10}, {850945,11}, {854017,10}, {855041,11}, {862209,10}, {863233,11}, {866305,10}, {867329,11}, {876545,10}, {877569,11}, {882689,10}, {883713,9}, {884225,11}, {884737,13}, {1040385,12}, {1044481,11}, {1112065,10}, {1113089,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,10}, {1375233,12}, {1437697,11}, {1505281,10}, {1506305,12}, {1515521,13}, {1523713,12}, {1527809,13}, {1540097,12}, {1544193,13}, 
{1548289,12}, {1568769,11}, {1636353,10}, {1637377,12}, {1699841,11}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {273,5}, {289,4}, {305,5}, {673,6}, {705,5}, {737,6}, {769,5}, {801,6}, {1345,7}, {1409,6}, {1537,7}, {1665,6}, {1729,7}, {2689,8}, {2817,7}, {3201,8}, {3329,7}, {3713,8}, {3841,7}, {4225,8}, {4865,7}, {4993,9}, {5121,8}, {6657,9}, {7681,8}, {8961,9}, {11777,8}, {12033,10}, {12289,8}, {12545,9}, {13825,10}, {14337,9}, {14849,10}, {15361,9}, {19969,10}, {23553,9}, {24577,11}, {30721,10}, {31745,9}, {32257,10}, {37889,9}, {38401,10}, {39937,9}, {40449,10}, {48129,11}, {63489,10}, {80897,11}, {96257,12}, {126977,11}, {129025,10}, {130049,11}, {194561,12}, {208897,11}, {210945,12}, {258049,11}, {260097,9}, {269313,10}, {277505,9}, {278017,11}, {278529,10}, {280577,11}, {282625,10}, {283649,11}, {284673,10}, {285697,11}, {286721,10}, {289793,11}, {290817,10}, {293889,9}, {294401,10}, {310273,9}, {310785,8}, {311041,10}, {311297,11}, {315393,10}, {321537,12}, {323585,11}, {325633,10}, {326657,12}, {331777,10}, {332801,12}, {389121,10}, {392193,9}, {392705,10}, {413697,9}, {414209,10}, {418817,9}, {419841,10}, {424961,9}, {425473,10}, {441345,9}, {441857,10}, {449537,9}, {450561,10}, {452609,9}, {453121,10}, {454657,9}, {455169,10}, {490497,12}, {491521,13}, {516097,12}, {520193,10}, {523265,11}, {555009,10}, {556033,11}, {587777,10}, {588801,11}, {620545,10}, {621569,9}, {622081,11}, {624641,12}, {626689,11}, {653313,10}, {654337,11}, {686081,10}, {687105,11}, {718849,10}, {720897,11}, {722945,10}, {724993,11}, {729089,10}, {734209,11}, {737281,10}, {744449,11}, {745473,10}, {747521,11}, {749569,10}, {752641,11}, {784385,10}, {785409,11}, {808961,10}, {809985,11}, {817153,10}, {818177,11}, {849921,10}, {850945,11}, {851969,10}, {852993,11}, {858113,10}, {859137,11}, {860161,10}, {861185,11}, {882689,10}, {883713,11}, {980993,13}, {1040385,11}, {1112065,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,10}, {1375233,12}, {1437697,11}, {1505281,10}, {1506305,12}, {1568769,11}, {1636353,10}, {1637377,12}, {MP_SIZE_T_MAX,0}}
diff --git a/gmp/mpn/x86/p6/mmx/lshift.asm b/gmp/mpn/x86/p6/mmx/lshift.asm
index febd1c0e6c..e325b67d64 100644
--- a/gmp/mpn/x86/p6/mmx/lshift.asm
+++ b/gmp/mpn/x86/p6/mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-II mpn_lshift -- mpn left shift.
dnl Copyright 2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl The P55 code runs well on P-II/III, but could stand some minor tweaks
diff --git a/gmp/mpn/x86/p6/mmx/popham.asm b/gmp/mpn/x86/p6/mmx/popham.asm
index fd340e4b45..421daa5308 100644
--- a/gmp/mpn/x86/p6/mmx/popham.asm
+++ b/gmp/mpn/x86/p6/mmx/popham.asm
@@ -2,32 +2,21 @@ dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
dnl hamming distance.
dnl Copyright 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/mmx/rshift.asm b/gmp/mpn/x86/p6/mmx/rshift.asm
index 77aa1909fa..b1543cdf52 100644
--- a/gmp/mpn/x86/p6/mmx/rshift.asm
+++ b/gmp/mpn/x86/p6/mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-II mpn_rshift -- mpn left shift.
dnl Copyright 2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl The P55 code runs well on P-II/III, but could stand some minor tweaks
diff --git a/gmp/mpn/x86/p6/mod_1.asm b/gmp/mpn/x86/p6/mod_1.asm
new file mode 100644
index 0000000000..b6eacf7e82
--- /dev/null
+++ b/gmp/mpn/x86/p6/mod_1.asm
@@ -0,0 +1,472 @@
+dnl Intel P6 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 21.5 cycles/limb
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse);
+C
+C The code here is in two parts, a simple divl loop and a mul-by-inverse.
+C The divl is used by mod_1 and mod_1c for small sizes, until the savings in
+C the mul-by-inverse can overcome the time to calculate an inverse.
+C preinv_mod_1 goes straight to the mul-by-inverse.
+C
+C The mul-by-inverse normalizes the divisor (or for preinv_mod_1 it's
+C already normalized). The calculation done is r=a%(d*2^n) followed by a
+C final (r*2^n)%(d*2^n), where a is the dividend, d the divisor, and n is
+C the number of leading zero bits on d. This means there are no bit shifts in
+C the main loop, at the cost of an extra divide step at the end.
+C
+C The simple divl for mod_1 is able to skip one divide step if high<divisor.
+C For mod_1c the carry parameter is the high of the first divide step, and
+C no attempt is made to skip that step since carry==0 will be very rare.
+C
+C The mul-by-inverse always skips one divide step, but then needs an extra
+C step at the end, unless the divisor was already normalized (n==0). This
+C leads to different mul-by-inverse thresholds for normalized and
+C unnormalized divisors, in mod_1 and mod_1c.
+C
+C Alternatives:
+C
+C If n is small then the extra divide step could be done by a few shift and
+C trial subtract steps instead of a full divide. That would probably be 3
+C or 4 cycles/bit, so say up to n=8 might benefit from that over a 21 cycle
+C divide. However it's considered that small divisors, meaning biggish n,
+C are more likely than small n, and that it's not worth the branch
+C mispredicts of a loop.
+C
+C Past:
+C
+C There used to be some MMX based code for P-II and P-III, roughly following
+C the K7 form, but it was slower (about 24.0 c/l) than the code here. That
+C code did have an advantage that mod_1 was able to do one less divide step
+C when high<divisor and the divisor unnormalized, but the speed advantage of
+C the current code soon overcomes that.
+C
+C Future:
+C
+C It's not clear whether what's here is optimal. A rough count of micro-ops
+C on the dependent chain would suggest a couple of cycles could be shaved,
+C perhaps.
+
+
+dnl The following thresholds are the sizes where the multiply by inverse
+dnl method is used instead of plain divl's. Minimum value 2 each.
+dnl
+dnl MUL_NORM_THRESHOLD is for normalized divisors (high bit set),
+dnl MUL_UNNORM_THRESHOLD for unnormalized divisors.
+dnl
+dnl With the divl loop at 39 c/l, and the inverse loop at 21.5 c/l but
+dnl setups for the inverse of about 50, the threshold should be around
+dnl 50/(39-21.5)==2.85. An unnormalized divisor gets an extra divide step
+dnl at the end, so if that's about 25 cycles then that threshold might be
+dnl around (50+25)/(39-21.5) == 4.3.
+
+deflit(MUL_NORM_THRESHOLD, 4)
+deflit(MUL_UNNORM_THRESHOLD, 5)
+
+deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))
+
+
+defframe(PARAM_INVERSE, 16) dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
+defframe(PARAM_DIVISOR, 12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+
+deflit(STACK_SPACE, 24)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0) C entry for a caller-supplied inverse: no divl loop, no threshold test
+
+ movl PARAM_SRC, %edx C edx = src
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx C ebx = size; NOTE(review): no size==0 check here, presumably size>=1 is required
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp C ebp = divisor, already normalized per the notes at the top of the file
+
+ movl %esi, SAVE_ESI
+ movl PARAM_INVERSE, %eax C eax = inverse, stored to VAR_INVERSE at L(preinv_entry)
+
+ movl %edi, SAVE_EDI
+ movl -4(%edx,%ebx,4), %edi C src high limb
+
+ movl $0, VAR_NORM C shift count 0, since nothing was shifted on this path
+ leal -8(%edx,%ebx,4), %ecx C &src[size-2]
+
+ C
+
+ movl %edi, %esi C keep a copy of the high limb for the restore below
+ subl %ebp, %edi C high-divisor
+
+ cmovc( %esi, %edi) C restore if underflow
+ decl %ebx C size-1, sets ZF when size==1
+ jnz L(preinv_entry) C limbs remain: join the shared mul-by-inverse loop
+
+ jmp L(done_edi) C size==1: edi holds the reduced high limb; L(done_edi) is defined later in the file
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0) C entry with an explicit carry-in; picks divl loop or mul-by-inverse by size
+
+ movl PARAM_SIZE, %ecx C ecx = size
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %eax C eax = divisor, used here only to select the threshold
+
+ movl %esi, SAVE_ESI
+ movl PARAM_CARRY, %edx C edx = carry, the high half of the first divide step
+
+ movl PARAM_SRC, %esi C esi = src
+ orl %ecx, %ecx C sets ZF when size==0
+ jz L(done_edx) C result==carry if size==0
+
+ sarl $31, %eax C -1 if the divisor high bit is set (normalized), else 0
+ movl PARAM_DIVISOR, %ebp C ebp = divisor
+
+ andl $MUL_NORM_DELTA, %eax C NORM minus UNNORM threshold if normalized, else 0
+
+ addl $MUL_UNNORM_THRESHOLD, %eax C eax = mul-by-inverse threshold for this divisor
+
+ cmpl %eax, %ecx
+ jb L(divide_top) C size below threshold: plain divl loop, unsigned compare
+
+
+ C The carry parameter pretends to be the src high limb.
+
+ movl %ebx, SAVE_EBX C ebx is saved only on this path; the divl exit restores just esi and ebp
+ leal 1(%ecx), %ebx C size+1
+
+ movl %edx, %eax C carry
+ jmp L(mul_by_inverse_1c) C enters after the SAVE_EBX and PARAM_SIZE loads at L(mul_by_inverse)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+ movl $0, %edx C initial carry (if can't skip a div)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %eax
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx, %ecx
+ jz L(done_edx)
+
+ movl -4(%eax,%ecx,4), %eax C src high limb
+
+ sarl $31, %ebp
+
+ andl $MUL_NORM_DELTA, %ebp
+
+ addl $MUL_UNNORM_THRESHOLD, %ebp
+ cmpl %esi, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ movl PARAM_SRC, %esi
+
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(done_eax) C done if had size==1
+
+ cmpl %ebp, %ecx
+ movl PARAM_DIVISOR, %ebp
+ jae L(mul_by_inverse)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(done_edx):
+ movl %edx, %eax
+L(done_eax):
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax src high limb
+ C ebx (saved below, then size)
+ C ecx (overwritten by bsrl below)
+ C edx (overwritten below)
+ C esi src
+ C edi (saved below, then n2)
+ C ebp divisor
+
+ movl %ebx, SAVE_EBX C saved here, restored by the inverse exit paths
+ movl PARAM_SIZE, %ebx C ebx = size
+C mpn_mod_1c joins here with eax = carry and ebx = size+1, ebx already saved.
+L(mul_by_inverse_1c):
+ bsrl %ebp, %ecx C 31-l
+
+ movl %edi, SAVE_EDI C saved here, restored by the inverse exit paths
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM C shift count l kept for the final denormalization
+ shll %cl, %ebp C d normalized
+
+ movl %eax, %edi C src high -> n2
+ subl %ebp, %eax C src high - d, carry flag if underflow
+
+ cmovnc( %eax, %edi) C n2-divisor if no underflow
+
+ movl $-1, %eax C low half of the dividend, b-1
+ movl $-1, %edx
+
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+ leal -8(%esi,%ebx,4), %ecx C &src[size-2] (ebx is size+1 from mod_1c, giving &src[size-1])
+
+ divl %ebp C floor (b*(b-d)-1) / d
+C NOTE(review): no jump to preinv_entry is visible in this chunk; presumably a preinv entry earlier in the file arrives with the inverse in eax -- confirm.
+L(preinv_entry):
+ movl %eax, VAR_INVERSE C m, the inverse used by the multiplies below
+
+
+
+C No special scheduling of loads is necessary in this loop, out of order
+C execution hides the latencies already.
+C
+C The way q1+1 is generated in %ebx and d is moved to %eax for the multiply
+C seems fastest. The obvious change to generate q1+1 in %eax and then just
+C multiply by %ebp (as per mpn/x86/pentium/mod_1.asm in fact) runs 1 cycle
+C slower, for no obvious reason.
+
+
+ ALIGN(16)
+L(inverse_top):
+ C eax n10 (then scratch)
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl (%ecx), %eax C next src limb
+ movl %eax, %esi C n10 kept in esi while eax is reused
+
+ sarl $31, %eax C -n1
+ movl %ebp, %ebx C d
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ subl $4, %ecx C step the src pointer down one limb
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1 (leal keeps the carry flag for adcl)
+ movl %ebp, %eax C d (movl keeps the carry flag; the sum in eax is not needed)
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff) C q1+1 wrapped to 0, meaning q1 == 0xFFFFFFFF
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ subl %eax, %esi C low n - (q1+1)*d
+
+ sbbl %edx, %edi C high n - (q1+1)*d, 0 or -1
+
+ andl %ebp, %edi C d if underflow
+
+ addl %esi, %edi C remainder with addback if necessary
+
+ cmpl PARAM_SRC, %ecx
+ jae L(inverse_top) C loop while the pointer is still at or above src
+
+
+C -----------------------------------------------------------------------------
+L(inverse_loop_done):
+
+ C %edi is the remainder modulo d*2^n and now must be reduced to
+ C 0<=r<d by calculating r*2^n mod d*2^n and then right shifting by
+ C n. If d was already normalized on entry so that n==0 then nothing
+ C is needed here. The chance of n==0 is low, but it's true of say
+ C PP from gmp-impl.h.
+ C Here n is the shift count, called l above and held in VAR_NORM.
+ C eax
+ C ebx
+ C ecx (becomes the shift count)
+ C edx
+ C esi (becomes n2)
+ C edi remainder
+ C ebp divisor (normalized)
+
+ movl VAR_NORM, %ecx C shift count saved during the inverse setup
+ movl $0, %esi C cleared so shldl below leaves only the bits shifted out of edi
+
+ orl %ecx, %ecx C ZF if the shift count is 0
+ jz L(done_edi) C divisor was already normalized, edi is the answer
+
+
+ C Here use %edi=n10 and %esi=n2, opposite to the loop above.
+ C
+ C The q1=0xFFFFFFFF case is handled with an sbbl to adjust q1+1
+ C back, rather than q1_ff special case code. This is simpler and
+ C costs only 2 uops.
+
+ shldl( %cl, %edi, %esi) C n2 = bits of the remainder shifted out at the top
+
+ shll %cl, %edi C n10 = remainder << shift count
+
+ movl %edi, %eax C n10
+ movl %ebp, %ebx C d
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %esi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%esi), %ebx C n2+1
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C minus the adcl carry, so a wrapped q1+1 becomes 0xFFFFFFFF
+ movl %ebp, %eax C d
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx C ebx is finished with, restore it
+
+ C
+
+ subl %eax, %edi C low n - (q1+1)*d is remainder
+
+ sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1
+
+ andl %ebp, %esi C d if underflow
+ movl SAVE_EBP, %ebp C restore ebp, d is no longer needed
+
+ leal (%esi,%edi), %eax C remainder
+ movl SAVE_ESI, %esi C restore esi
+
+ shrl %cl, %eax C denorm remainder
+ movl SAVE_EDI, %edi C restore edi
+ addl $STACK_SPACE, %esp C release the frame
+
+ ret
+
+
+L(done_edi):
+ movl SAVE_EBX, %ebx C restore all four registers saved on the inverse path
+ movl %edi, %eax C remainder is the return value, no denormalization needed
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi C edi was copied to eax above, safe to restore now
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp C release the frame
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d.
+C
+C This is reached only very rarely.
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer (already stepped down by inverse_top)
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+
+ cmpl PARAM_SRC, %ecx C same loop test as at the end of inverse_top
+ jae L(inverse_top) C more limbs to process
+
+ jmp L(inverse_loop_done) C all limbs consumed
+
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/p6/mod_34lsub1.asm b/gmp/mpn/x86/p6/mod_34lsub1.asm
index b88ab5d17c..5e854b7274 100644
--- a/gmp/mpn/x86/p6/mod_34lsub1.asm
+++ b/gmp/mpn/x86/p6/mod_34lsub1.asm
@@ -1,32 +1,21 @@
dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/mode1o.asm b/gmp/mpn/x86/p6/mode1o.asm
index c62b676e5a..4aff48d7e6 100644
--- a/gmp/mpn/x86/p6/mode1o.asm
+++ b/gmp/mpn/x86/p6/mode1o.asm
@@ -1,32 +1,21 @@
dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
-dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -112,7 +101,7 @@ ifdef(`PIC',`
subl %eax, %edi C inv = 2*inv - inv*inv*d
- ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
movl PARAM_DIVISOR, %eax
imull %edi, %eax
cmpl $1, %eax')
@@ -124,7 +113,7 @@ C subl %edx, %eax 1
C imull %edi, %eax 4
C mull PARAM_DIVISOR 5
C ----
-C total 10
+C total 10
C
C and this is the measured speed. No special scheduling is necessary, out
C of order execution hides the load latency.
diff --git a/gmp/mpn/x86/p6/mul_basecase.asm b/gmp/mpn/x86/p6/mul_basecase.asm
index d87bc12b60..fc1afbdf0e 100644
--- a/gmp/mpn/x86/p6/mul_basecase.asm
+++ b/gmp/mpn/x86/p6/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
-dnl Copyright 1999-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/p3mmx/popham.asm b/gmp/mpn/x86/p6/p3mmx/popham.asm
index db2f2601c9..2f58968a31 100644
--- a/gmp/mpn/x86/p6/p3mmx/popham.asm
+++ b/gmp/mpn/x86/p6/p3mmx/popham.asm
@@ -2,32 +2,21 @@ dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
dnl hamming distance.
dnl Copyright 2000, 2002, 2004, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/sqr_basecase.asm b/gmp/mpn/x86/p6/sqr_basecase.asm
index 8fc7fdf375..05a31f1a15 100644
--- a/gmp/mpn/x86/p6/sqr_basecase.asm
+++ b/gmp/mpn/x86/p6/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -38,15 +27,15 @@ C which is the Karatsuba recursing range).
dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
dnl a description. The only difference here is that UNROLL_COUNT can go up
-dnl to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
+dnl to 64 (not 63) making SQR_KARATSUBA_THRESHOLD_MAX 67.
-deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
+deflit(SQR_KARATSUBA_THRESHOLD_MAX, 67)
-ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
-`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
+`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
-m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
-deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
diff --git a/gmp/mpn/x86/p6/sse2/addmul_1.asm b/gmp/mpn/x86/p6/sse2/addmul_1.asm
index 144b627aa3..b601c54bcf 100644
--- a/gmp/mpn/x86/p6/sse2/addmul_1.asm
+++ b/gmp/mpn/x86/p6/sse2/addmul_1.asm
@@ -1,32 +1,21 @@
dnl Intel P6/SSE2 mpn_addmul_1.
dnl Copyright 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/sse2/gmp-mparam.h b/gmp/mpn/x86/p6/sse2/gmp-mparam.h
index 69226289a7..843227b99a 100644
--- a/gmp/mpn/x86/p6/sse2/gmp-mparam.h
+++ b/gmp/mpn/x86/p6/sse2/gmp-mparam.h
@@ -1,197 +1,74 @@
/* Intel P6/sse2 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2003, 2008-2010 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2008, 2009
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-or both in parallel, as here.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+/* NOTE: In a fat binary build SQR_KARATSUBA_THRESHOLD here cannot be more
+ than the value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard
+ limit in mpn/x86/p6/sqr_basecase.asm. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+/* 1867 MHz P6 model 13 */
+/* Generated by tuneupc, 2008-10-30, gcc 4.3 */
-/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
- value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
- mpn/x86/p6/sqr_basecase.asm. */
+#define MUL_KARATSUBA_THRESHOLD 20
+#define MUL_TOOM3_THRESHOLD 77
+#define MUL_TOOM44_THRESHOLD 142
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 154
-/* 1867 MHz P6 model 13 */
+#define MULLOW_BASECASE_THRESHOLD 4
+#define MULLOW_DC_THRESHOLD 38
+#define MULLOW_MUL_N_THRESHOLD 234
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 24
+#define POWM_THRESHOLD 150
+
+#define MATRIX22_STRASSEN_THRESHOLD 23
+#define HGCD_THRESHOLD 95
+#define GCD_DC_THRESHOLD 381
+#define GCDEXT_DC_THRESHOLD 419
+#define JACOBI_BASE_METHOD 1
+
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 276
+#define SET_STR_PRECOMPUTE_THRESHOLD 1078
+
+#define MUL_FFT_TABLE { 400, 928, 1664, 3584, 10240, 40960, 98304, 393216, 1572864, 0 }
+#define MUL_FFT_MODF_THRESHOLD 496
+#define MUL_FFT_THRESHOLD 7168
-#define MOD_1_NORM_THRESHOLD 4
-#define MOD_1_UNNORM_THRESHOLD 4
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 21
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 77
-#define MUL_TOOM44_THRESHOLD 169
-#define MUL_TOOM6H_THRESHOLD 246
-#define MUL_TOOM8H_THRESHOLD 381
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 101
-#define SQR_TOOM4_THRESHOLD 154
-#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 527
-
-#define MULMID_TOOM42_THRESHOLD 58
-
-#define MULMOD_BNM1_THRESHOLD 13
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
- { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \
- { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \
- { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \
- { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
- { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
- { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
- { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \
- { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
- { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \
- { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
- { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
- { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
- { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
- { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \
- { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
- { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
- { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
- { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
- { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
- { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
- { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \
- { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 132
-#define MUL_FFT_THRESHOLD 7424
-
-#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
- { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
- { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \
- { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \
- { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \
- { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 135,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
- { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \
- { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
- { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
- { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
- { 831,11}, { 223,12}, { 127,11}, { 255,10}, \
- { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
- { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
- { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
- { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
- { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
- { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
- { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
- { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
- { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
- { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
- { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
- { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
- { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 146
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 31
-#define MULLO_MUL_N_THRESHOLD 13463
-
-#define DC_DIV_QR_THRESHOLD 25
-#define DC_DIVAPPR_Q_THRESHOLD 55
-#define DC_BDIV_QR_THRESHOLD 60
-#define DC_BDIV_Q_THRESHOLD 132
-
-#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 65
-#define INV_APPR_THRESHOLD 65
-
-#define BINV_NEWTON_THRESHOLD 252
-#define REDC_1_TO_REDC_N_THRESHOLD 62
-
-#define MU_DIV_QR_THRESHOLD 1164
-#define MU_DIVAPPR_Q_THRESHOLD 748
-#define MUPI_DIV_QR_THRESHOLD 38
-#define MU_BDIV_QR_THRESHOLD 1360
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 2,23,258,879,2246
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 69
-#define HGCD_APPR_THRESHOLD 112
-#define HGCD_REDUCE_THRESHOLD 3389
-#define GCD_DC_THRESHOLD 386
-#define GCDEXT_DC_THRESHOLD 303
-#define JACOBI_BASE_METHOD 1
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 25
-#define SET_STR_DC_THRESHOLD 582
-#define SET_STR_PRECOMPUTE_THRESHOLD 1118
-
-#define FAC_DSC_THRESHOLD 178
-#define FAC_ODD_THRESHOLD 34
+#define SQR_FFT_TABLE { 432, 928, 1664, 3584, 10240, 40960, 98304, 393216, 1572864, 0 }
+#define SQR_FFT_MODF_THRESHOLD 448
+#define SQR_FFT_THRESHOLD 3840
diff --git a/gmp/mpn/x86/p6/sse2/mod_1_1.asm b/gmp/mpn/x86/p6/sse2/mod_1_1.asm
deleted file mode 100644
index 8b7b7adaa5..0000000000
--- a/gmp/mpn/x86/p6/sse2/mod_1_1.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel P6/SSE2 mpn_mod_1_1.
-
-dnl Copyright 2009, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_mod_1_1p)
-include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/gmp/mpn/x86/p6/sse2/mod_1_4.asm b/gmp/mpn/x86/p6/sse2/mod_1_4.asm
deleted file mode 100644
index 49c96c60b9..0000000000
--- a/gmp/mpn/x86/p6/sse2/mod_1_4.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl Intel P6/SSE2 mpn_mod_1_4.
-
-dnl Copyright 2009, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_mod_1s_4p)
-include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/gmp/mpn/x86/p6/sse2/mul_1.asm b/gmp/mpn/x86/p6/sse2/mul_1.asm
index 50e5b6983a..fc3d4e6414 100644
--- a/gmp/mpn/x86/p6/sse2/mul_1.asm
+++ b/gmp/mpn/x86/p6/sse2/mul_1.asm
@@ -1,32 +1,21 @@
dnl Intel P6/SSE2 mpn_mul_1.
dnl Copyright 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/sse2/mul_basecase.asm b/gmp/mpn/x86/p6/sse2/mul_basecase.asm
index 4687625790..f52ece025f 100644
--- a/gmp/mpn/x86/p6/sse2/mul_basecase.asm
+++ b/gmp/mpn/x86/p6/sse2/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl Intel P6/SSE2 mpn_mul_basecase.
dnl Copyright 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/sse2/popcount.asm b/gmp/mpn/x86/p6/sse2/popcount.asm
index 4c02b93be2..f818d6e230 100644
--- a/gmp/mpn/x86/p6/sse2/popcount.asm
+++ b/gmp/mpn/x86/p6/sse2/popcount.asm
@@ -1,32 +1,21 @@
dnl Intel P6/SSE2 mpn_popcount -- population count.
dnl Copyright 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/sse2/sqr_basecase.asm b/gmp/mpn/x86/p6/sse2/sqr_basecase.asm
index 76b574b6c7..8a7f24974d 100644
--- a/gmp/mpn/x86/p6/sse2/sqr_basecase.asm
+++ b/gmp/mpn/x86/p6/sse2/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl Intel P6/SSE2 mpn_sqr_basecase.
dnl Copyright 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/p6/sse2/submul_1.asm b/gmp/mpn/x86/p6/sse2/submul_1.asm
index 98a603ce93..ae97fd6346 100644
--- a/gmp/mpn/x86/p6/sse2/submul_1.asm
+++ b/gmp/mpn/x86/p6/sse2/submul_1.asm
@@ -1,32 +1,21 @@
dnl Intel P6/SSE2 mpn_submul_1.
dnl Copyright 2008 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/README b/gmp/mpn/x86/pentium/README
index 305936bbd9..6c4d872c47 100644
--- a/gmp/mpn/x86/pentium/README
+++ b/gmp/mpn/x86/pentium/README
@@ -1,30 +1,19 @@
-Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc.
+Copyright 1996, 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86/pentium/aors_n.asm b/gmp/mpn/x86/pentium/aors_n.asm
index 01ebfb96ae..30d0df79b0 100644
--- a/gmp/mpn/x86/pentium/aors_n.asm
+++ b/gmp/mpn/x86/pentium/aors_n.asm
@@ -1,32 +1,22 @@
dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -88,13 +78,13 @@ deflit(`FRAME',16)
pushl %edx
FRAME_pushl()
movl PARAM_CARRY,%eax
- shrl %eax C shift bit 0 into carry
+ shrl $1,%eax C shift bit 0 into carry
jmp L(oop)
L(endgo):
deflit(`FRAME',16)
movl PARAM_CARRY,%eax
- shrl %eax C shift bit 0 into carry
+ shrl $1,%eax C shift bit 0 into carry
jmp L(end)
EPILOGUE()
diff --git a/gmp/mpn/x86/pentium/aorsmul_1.asm b/gmp/mpn/x86/pentium/aorsmul_1.asm
index d83cc4513b..a50299b5cf 100644
--- a/gmp/mpn/x86/pentium/aorsmul_1.asm
+++ b/gmp/mpn/x86/pentium/aorsmul_1.asm
@@ -2,32 +2,21 @@ dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/bdiv_q_1.asm b/gmp/mpn/x86/pentium/bdiv_q_1.asm
deleted file mode 100644
index 9fee3cb87a..0000000000
--- a/gmp/mpn/x86/pentium/bdiv_q_1.asm
+++ /dev/null
@@ -1,260 +0,0 @@
-dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
-
-dnl Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
-
-dnl Copyright 2001, 2002, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C divisor
-C odd even
-C P54: 24.5 30.5 cycles/limb
-C P55: 23.0 28.0
-
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
-
-C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
-C expected. On P54 in the even case the shrdl pairing nonsense (see
-C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
-C further 1.5 slowdown for both odd and even.
-
-defframe(PARAM_SHIFT, 24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(VAR_INVERSE,`PARAM_DST')
-
- TEXT
-
- ALIGN(32)
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl $-1, %ecx
- movl PARAM_DIVISOR, %eax
-
-L(strip_twos):
- ASSERT(nz, `orl %eax, %eax')
- shrl %eax
- incl %ecx C shift count
-
- jnc L(strip_twos)
-
- leal 1(%eax,%eax), %edx C d
- andl $127, %eax C d/2, 7 bits
-
- pushl %ebx FRAME_pushl()
- pushl %ebp FRAME_pushl()
-
-ifdef(`PIC',`
- call L(here)
-L(here):
- popl %ebp C eip
-
- addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
- C AGI
- movl binvert_limb_table@GOT(%ebp), %ebp
- C AGI
- movzbl (%eax,%ebp), %eax
-',`
-
-dnl non-PIC
- movzbl binvert_limb_table(%eax), %eax C inv 8 bits
-')
-
- movl %eax, %ebp C inv
- addl %eax, %eax C 2*inv
-
- imull %ebp, %ebp C inv*inv
-
- imull %edx, %ebp C inv*inv*d
-
- subl %ebp, %eax C inv = 2*inv - inv*inv*d
- movl PARAM_SIZE, %ebx
-
- movl %eax, %ebp
- addl %eax, %eax C 2*inv
-
- imull %ebp, %ebp C inv*inv
-
- imull %edx, %ebp C inv*inv*d
-
- subl %ebp, %eax C inv = 2*inv - inv*inv*d
- movl %edx, PARAM_DIVISOR C d without twos
-
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
- pushl %eax FRAME_pushl()
- imull PARAM_DIVISOR, %eax
- cmpl $1, %eax
- popl %eax FRAME_popl()')
-
- jmp L(common)
-EPILOGUE()
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t inverse, int shift)
- ALIGN(32)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_SHIFT, %ecx
-
- pushl %ebx FRAME_pushl()
- pushl %ebp FRAME_pushl()
-
- movl PARAM_SIZE, %ebx
- movl PARAM_INVERSE, %eax
-
-L(common):
- pushl %esi FRAME_pushl()
- push %edi FRAME_pushl()
-
- movl PARAM_SRC, %esi
- movl PARAM_DST, %edi
- movl %eax, VAR_INVERSE
-
- leal (%esi,%ebx,4), %esi C src end
- leal (%edi,%ebx,4), %edi C dst end
-
- negl %ebx C -size
-
- xorl %ebp, %ebp C initial carry bit
-
- orl %ecx, %ecx C shift
- movl (%esi,%ebx,4), %eax C src low limb
- jz L(odd_entry)
-
- xorl %edx, %edx C initial carry limb (for even, if one)
- incl %ebx
- jz L(one)
-
- movl (%esi,%ebx,4), %edx C src second limb (for even)
- shrdl( %cl, %edx, %eax)
-
- jmp L(even_entry)
-
-
- ALIGN(8)
-L(odd_top):
- C eax scratch
- C ebx counter, limbs, negative
- C ecx
- C edx
- C esi src end
- C edi dst end
- C ebp carry bit, 0 or -1
-
- mull PARAM_DIVISOR
-
- movl (%esi,%ebx,4), %eax
- subl %ebp, %edx
-
- subl %edx, %eax
-
- sbbl %ebp, %ebp
-
-L(odd_entry):
- imull VAR_INVERSE, %eax
-
- movl %eax, (%edi,%ebx,4)
-
- incl %ebx
- jnz L(odd_top)
-
- popl %edi
- popl %esi
-
- popl %ebp
- popl %ebx
-
- ret
-
-L(even_top):
- C eax scratch
- C ebx counter, limbs, negative
- C ecx twos
- C edx
- C esi src end
- C edi dst end
- C ebp carry bit, 0 or -1
-
- mull PARAM_DIVISOR
-
- subl %ebp, %edx C carry bit
- movl -4(%esi,%ebx,4), %eax C src limb
-
- movl (%esi,%ebx,4), %ebp C and one above it
-
- shrdl( %cl, %ebp, %eax)
-
- subl %edx, %eax C carry limb
-
- sbbl %ebp, %ebp
-
-L(even_entry):
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi,%ebx,4)
- incl %ebx
-
- jnz L(even_top)
-
- mull PARAM_DIVISOR
-
- movl -4(%esi), %eax C src high limb
- subl %ebp, %edx
-
-L(one):
- shrl %cl, %eax
-
- subl %edx, %eax C no carry if division is exact
-
- imull VAR_INVERSE, %eax
-
- movl %eax, -4(%edi) C dst high limb
- nop C protect against cache bank clash
-
- popl %edi
- popl %esi
-
- popl %ebp
- popl %ebx
-
- ret
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium/com.asm b/gmp/mpn/x86/pentium/com_n.asm
index b0805452a6..c6d2d72e5e 100644
--- a/gmp/mpn/x86/pentium/com.asm
+++ b/gmp/mpn/x86/pentium/com_n.asm
@@ -1,32 +1,21 @@
-dnl Intel Pentium mpn_com -- mpn ones complement.
+dnl Intel Pentium mpn_com_n -- mpn ones complement.
dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -37,7 +26,7 @@ C P5: 1.75 cycles/limb
NAILS_SUPPORT(0-31)
-C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C This code is similar to mpn_copyi, basically there's just some "xorl
C $GMP_NUMB_MASK"s inserted.
@@ -55,7 +44,7 @@ defframe(PARAM_DST, 4)
TEXT
ALIGN(8)
-PROLOGUE(mpn_com)
+PROLOGUE(mpn_com_n)
deflit(`FRAME',0)
movl PARAM_SRC, %eax
diff --git a/gmp/mpn/x86/pentium/copyd.asm b/gmp/mpn/x86/pentium/copyd.asm
index 72a543b2a3..2be8c765ac 100644
--- a/gmp/mpn/x86/pentium/copyd.asm
+++ b/gmp/mpn/x86/pentium/copyd.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium mpn_copyd -- copy limb vector, decrementing.
dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/copyi.asm b/gmp/mpn/x86/pentium/copyi.asm
index d983d6b46e..9da08e2c06 100644
--- a/gmp/mpn/x86/pentium/copyi.asm
+++ b/gmp/mpn/x86/pentium/copyi.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing.
dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/dive_1.asm b/gmp/mpn/x86/pentium/dive_1.asm
index f80632f479..79885244a5 100644
--- a/gmp/mpn/x86/pentium/dive_1.asm
+++ b/gmp/mpn/x86/pentium/dive_1.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -157,7 +146,7 @@ dnl non-PIC
negl %ebx C -size
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
imull PARAM_DIVISOR, %eax
cmpl $1, %eax
diff --git a/gmp/mpn/x86/pentium/gmp-mparam.h b/gmp/mpn/x86/pentium/gmp-mparam.h
index befa6e27a9..5c49c4e3cb 100644
--- a/gmp/mpn/x86/pentium/gmp-mparam.h
+++ b/gmp/mpn/x86/pentium/gmp-mparam.h
@@ -1,36 +1,26 @@
/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004 Free Software
+Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* For mpn/x86/pentium/mod_1.asm */
@@ -41,11 +31,11 @@ see https://www.gnu.org/licenses/. */
/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */
-#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 90
+#define MUL_KARATSUBA_THRESHOLD 16
+#define MUL_TOOM3_THRESHOLD 90
#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 22
+#define SQR_KARATSUBA_THRESHOLD 22
#define SQR_TOOM3_THRESHOLD 122
#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
diff --git a/gmp/mpn/x86/pentium/hamdist.asm b/gmp/mpn/x86/pentium/hamdist.asm
index 2d7bc99b12..a129030f74 100644
--- a/gmp/mpn/x86/pentium/hamdist.asm
+++ b/gmp/mpn/x86/pentium/hamdist.asm
@@ -1,32 +1,21 @@
dnl Intel P5 mpn_hamdist -- mpn hamming distance.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/logops_n.asm b/gmp/mpn/x86/pentium/logops_n.asm
index 18773172e9..0552e55809 100644
--- a/gmp/mpn/x86/pentium/logops_n.asm
+++ b/gmp/mpn/x86/pentium/logops_n.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/lshift.asm b/gmp/mpn/x86/pentium/lshift.asm
index 2a31f36c6e..ece51e06d3 100644
--- a/gmp/mpn/x86/pentium/lshift.asm
+++ b/gmp/mpn/x86/pentium/lshift.asm
@@ -1,32 +1,22 @@
dnl Intel Pentium mpn_lshift -- mpn left shift.
-dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
index 02a0def127..e443c8c300 100644
--- a/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
+++ b/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -1,37 +1,26 @@
/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2004, 2009 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
/* For mpn/x86/pentium/mod_1.asm */
@@ -40,124 +29,45 @@ see https://www.gnu.org/licenses/. */
/* 233MHz P55 */
-#define MOD_1_NORM_THRESHOLD 5
-#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 12
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 11
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 63
-#define USE_PREINV_DIVREM_1 0
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 51
-
-#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 53
-#define MUL_TOOM44_THRESHOLD 128
-#define MUL_TOOM6H_THRESHOLD 189
-#define MUL_TOOM8H_THRESHOLD 260
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 90
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 20
-#define SQR_TOOM3_THRESHOLD 73
-#define SQR_TOOM4_THRESHOLD 178
-#define SQR_TOOM6_THRESHOLD 210
-#define SQR_TOOM8_THRESHOLD 375
-
-#define MULMOD_BNM1_THRESHOLD 11
-#define SQRMOD_BNM1_THRESHOLD 12
-
-#define MUL_FFT_MODF_THRESHOLD 364 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 364, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
- { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
- { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
- { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
- { 79, 9}, { 159, 8}, { 319, 9}, { 167,10}, \
- { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
- { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
- { 383,12}, { 63,11}, { 127,10}, { 271, 9}, \
- { 543,10}, { 287,11}, { 159,10}, { 351,11}, \
- { 191,10}, { 415,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 287,10}, { 575,11}, \
- { 351,12}, { 191,11}, { 415,13}, { 127,12}, \
- { 255,11}, { 575,12}, { 319,11}, { 703,12}, \
- { 383,11}, { 831,12}, { 447,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 90
-#define MUL_FFT_THRESHOLD 3520
-
-#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 17, 7}, { 9, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 27, 7}, { 55, 9}, \
- { 15, 8}, { 31, 7}, { 65, 8}, { 43, 9}, \
- { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \
- { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
- { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \
- { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \
- { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
- { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \
- { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \
- { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
- { 351,11}, { 191,10}, { 415,11}, { 223,10}, \
- { 447,12}, { 127,11}, { 255,10}, { 543,11}, \
- { 287,10}, { 607,11}, { 351,12}, { 191,11}, \
- { 479,13}, { 127,12}, { 255,11}, { 575,12}, \
- { 319,11}, { 703,12}, { 383,11}, { 767,12}, \
- { 447,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 96
-#define SQR_FFT_THRESHOLD 5504
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 48
-#define MULLO_MUL_N_THRESHOLD 6633
-
-#define DC_DIV_QR_THRESHOLD 43
-#define DC_DIVAPPR_Q_THRESHOLD 170
-#define DC_BDIV_QR_THRESHOLD 43
-#define DC_BDIV_Q_THRESHOLD 110
-
-#define INV_MULMOD_BNM1_THRESHOLD 30
-#define INV_NEWTON_THRESHOLD 177
-#define INV_APPR_THRESHOLD 171
-
-#define BINV_NEWTON_THRESHOLD 194
-#define REDC_1_TO_REDC_N_THRESHOLD 50
-
-#define MU_DIV_QR_THRESHOLD 1142
-#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 90
-#define MU_BDIV_QR_THRESHOLD 942
-#define MU_BDIV_Q_THRESHOLD 1017
-
-#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 92
-#define GCD_DC_THRESHOLD 283
-#define GCDEXT_DC_THRESHOLD 221
-#define JACOBI_BASE_METHOD 2
-
-#define GET_STR_DC_THRESHOLD 18
-#define GET_STR_PRECOMPUTE_THRESHOLD 31
-#define SET_STR_DC_THRESHOLD 490
-#define SET_STR_PRECOMPUTE_THRESHOLD 994
+/* Generated by tuneup.c, 2009-01-06, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 16
+#define MUL_TOOM3_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 131
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 77
+#define SQR_TOOM4_THRESHOLD 168
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 40
+#define MULLOW_MUL_N_THRESHOLD 266
+
+#define DIV_SB_PREINV_THRESHOLD 4
+#define DIV_DC_THRESHOLD 43
+#define POWM_THRESHOLD 64
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 95
+#define GCD_DC_THRESHOLD 316
+#define GCDEXT_DC_THRESHOLD 316
+#define JACOBI_BASE_METHOD 2
+
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 527
+#define SET_STR_PRECOMPUTE_THRESHOLD 1069
+
+#define MUL_FFT_TABLE { 304, 672, 1152, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 320
+#define MUL_FFT_THRESHOLD 3840
+
+#define SQR_FFT_TABLE { 304, 672, 1152, 4608, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD 320
+#define SQR_FFT_THRESHOLD 3840
diff --git a/gmp/mpn/x86/pentium/mmx/hamdist.asm b/gmp/mpn/x86/pentium/mmx/hamdist.asm
index 72e3196697..185eeaee22 100644
--- a/gmp/mpn/x86/pentium/mmx/hamdist.asm
+++ b/gmp/mpn/x86/pentium/mmx/hamdist.asm
@@ -1,32 +1,21 @@
dnl Intel P55 mpn_hamdist -- mpn hamming distance.
dnl Copyright 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mmx/lshift.asm b/gmp/mpn/x86/pentium/mmx/lshift.asm
index 04b0ddcc8f..012d794952 100644
--- a/gmp/mpn/x86/pentium/mmx/lshift.asm
+++ b/gmp/mpn/x86/pentium/mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl Intel P5 mpn_lshift -- mpn left shift.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mmx/mul_1.asm b/gmp/mpn/x86/pentium/mmx/mul_1.asm
index 4ced577b13..b9fe77ed07 100644
--- a/gmp/mpn/x86/pentium/mmx/mul_1.asm
+++ b/gmp/mpn/x86/pentium/mmx/mul_1.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mmx/rshift.asm b/gmp/mpn/x86/pentium/mmx/rshift.asm
index e3b274bb63..f50b8ab0e0 100644
--- a/gmp/mpn/x86/pentium/mmx/rshift.asm
+++ b/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl Intel P5 mpn_rshift -- mpn right shift.
dnl Copyright 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mod_1.asm b/gmp/mpn/x86/pentium/mod_1.asm
new file mode 100644
index 0000000000..408242e7a9
--- /dev/null
+++ b/gmp/mpn/x86/pentium/mod_1.asm
@@ -0,0 +1,454 @@
+dnl Intel P5 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 28.0 cycles/limb
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse);
+C
+C This code is not unlike mpn/x86/p6/mod_1.asm; it does the same sort of
+C multiply by inverse without on-the-fly shifts. See that code for some
+C general comments.
+C
+C Alternatives:
+C
+C P5 shldl is 4 cycles, so shifting on the fly would be at least 5 cycles
+C slower, probably more depending on what it did to register usage. Using MMX
+C on P55 would be better, but still at least 4 or 5 instructions and so 2 or
+C 3 cycles.
+
+
+dnl These thresholds are the sizes where the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl MUL_NORM_THRESHOLD is for an already normalized divisor (high bit set),
+dnl MUL_UNNORM_THRESHOLD for an unnormalized divisor.
+dnl
+dnl With the divl loop at 44 c/l and the inverse at 28 c/l with about 70
+dnl cycles to set up, the threshold should be about ceil(70/16)==5, which is
+dnl what happens in practice.
+dnl
+dnl An unnormalized divisor gets an extra 40 cycles at the end for the
+dnl final (r*2^n)%(d*2^n) and shift. This increases the threshold by about
+dnl 40/16=3.
+dnl
+dnl PIC adds between 4 and 7 cycles (not sure why it varies), but this
+dnl doesn't change the thresholds.
+dnl
+dnl The entry sequence code that chooses between MUL_NORM_THRESHOLD and
+dnl MUL_UNNORM_THRESHOLD is a bit horrible, but it adds only 2 cycles
+dnl (branch free) and ensures the choice between div or mul is optimal.
+
+deflit(MUL_NORM_THRESHOLD, ifdef(`PIC',5,5))
+deflit(MUL_UNNORM_THRESHOLD, ifdef(`PIC',8,8))
+
+deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD)) dnl norm minus unnorm, masked in when d has its high bit set
+
+
+defframe(PARAM_INVERSE, 16) dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
+defframe(PARAM_DIVISOR, 12) dnl third argument in the prototypes above
+defframe(PARAM_SIZE, 8) dnl second argument
+defframe(PARAM_SRC, 4) dnl first argument
+
+dnl re-using parameter space
+define(VAR_NORM, `PARAM_DIVISOR') dnl shift count, written only after the divisor is in ebp
+define(VAR_INVERSE, `PARAM_SIZE') dnl inverse, written only after the size has been read
+
+ TEXT
+
+ ALIGN(8)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+
+ pushl %ebp FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %edx
+
+ pushl %edi FRAME_pushl()
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DIVISOR, %ebp
+ movl PARAM_INVERSE, %eax
+
+ movl -4(%esi,%edx,4), %edi C src high limb
+ leal -8(%esi,%edx,4), %esi C &src[size-2]
+
+ movl $0, VAR_NORM C shift count 0, divisor already copied to ebp
+ decl %edx C size-1, zero when there is a single limb
+
+ jnz L(start_preinv) C two or more limbs use the shared inverse loop
+
+ subl %ebp, %edi C src-divisor
+ popl %ebx
+
+ sbbl %ecx, %ecx C -1 if underflow
+ movl %edi, %eax C src-divisor
+
+ andl %ebp, %ecx C d if underflow
+ popl %edi
+
+ addl %ecx, %eax C remainder, with possible addback
+ popl %esi
+
+ popl %ebp
+
+ ret
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ movl PARAM_SIZE, %ecx
+
+ sarl $31, %eax C d highbit
+ movl PARAM_CARRY, %edx
+
+ orl %ecx, %ecx
+ jz L(done_edx) C result==carry if size==0
+
+ andl $MUL_NORM_DELTA, %eax C delta if d has its high bit set, else 0
+ pushl %ebp FRAME_pushl()
+
+ addl $MUL_UNNORM_THRESHOLD, %eax C norm or unnorm thresh
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DIVISOR, %ebp
+
+ cmpl %eax, %ecx C size against the chosen threshold
+ jb L(divide_top) C below threshold, use the divl loop with edx=carry
+
+ movl %edx, %eax C carry as pretend src high limb
+ leal 1(%ecx), %edx C size+1
+
+ cmpl $0x1000000, %ebp C carry flag is consumed by the sbbl at the jump target
+ jmp L(mul_by_inverse_1c)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebp FRAME_pushl()
+
+ orl %ecx, %ecx
+ jz L(done_zero) C size==0 returns 0
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DIVISOR, %ebp
+
+ sarl $31, %ebp C -1 if divisor normalized
+ movl -4(%eax,%ecx,4), %eax C src high limb
+
+ movl PARAM_DIVISOR, %edx
+ pushl %esi FRAME_pushl()
+
+ andl $MUL_NORM_DELTA, %ebp
+ cmpl %edx, %eax C carry flag if high<divisor
+
+ sbbl %edx, %edx C -1 if high<divisor
+ addl $MUL_UNNORM_THRESHOLD, %ebp C norm or unnorm thresh
+
+ addl %edx, %ecx C size-1 if high<divisor
+ jz L(done_eax) C single limb below the divisor is the remainder
+
+ cmpl %ebp, %ecx
+ movl PARAM_DIVISOR, %ebp
+
+ movl PARAM_SRC, %esi
+ jae L(mul_by_inverse) C flags still from the cmpl, size at or above threshold
+
+ andl %eax, %edx C high as initial carry if high<divisor
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax C src[ecx-1]
+
+ divl %ebp C edx:eax / d, remainder left in edx
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+ popl %esi
+ popl %ebp
+
+L(done_edx):
+ movl %edx, %eax
+
+ ret
+
+
+L(done_zero):
+ xorl %eax, %eax
+ popl %ebp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C The divisor is normalized using the same code as the pentium
+C count_leading_zeros in longlong.h. Going through the GOT for PIC costs a
+C couple of cycles, but is more or less unavoidable.
+
+
+ ALIGN(8)
+L(mul_by_inverse):
+ C eax src high limb
+ C ebx
+ C ecx size or size-1
+ C edx
+ C esi src
+ C edi
+ C ebp divisor
+
+ movl PARAM_SIZE, %edx
+ cmpl $0x1000000, %ebp
+
+L(mul_by_inverse_1c):
+ sbbl %ecx, %ecx C -1 if d < 0x1000000, else 0
+ cmpl $0x10000, %ebp
+
+ sbbl $0, %ecx C one less if d < 0x10000
+ cmpl $0x100, %ebp
+
+ sbbl $0, %ecx C one less if d < 0x100
+ pushl %edi FRAME_pushl()
+
+ pushl %ebx FRAME_pushl()
+ movl %ebp, %ebx C d
+
+ifdef(`PIC',`
+ call L(here)
+L(here):
+ popl %edi
+ leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
+
+ shrl %cl, %ebx
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %edi
+
+ C AGI
+ movl __clz_tab@GOT(%edi), %edi
+ addl $-34, %ecx
+
+ C AGI
+ movb (%ebx,%edi), %bl
+
+',`
+ leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1
+
+ shrl %cl, %ebx
+ addl $-34, %ecx
+
+ C AGI
+ movb __clz_tab(%ebx), %bl
+')
+ movl %eax, %edi C carry -> n1
+
+ addl %ebx, %ecx C -34 + c + __clz_tab[d>>c] = -clz-1
+ leal -8(%esi,%edx,4), %esi C &src[size-2]
+
+ xorl $-1, %ecx C clz
+ movl $-1, %edx
+
+ ASSERT(e,`pushl %eax C clz calculation same as bsrl
+ bsrl %ebp, %eax
+ xorl $31, %eax
+ cmpl %eax, %ecx
+ popl %eax')
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ subl %ebp, %edx C (b-d)-1, so edx:eax = b*(b-d)-1
+ movl $-1, %eax
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+ movl %eax, VAR_INVERSE
+ movl %ebp, %eax C d
+
+ movl %ecx, %edx C fake high, will cancel
+
+
+C For mpn_mod_1 and mpn_preinv_mod_1, the initial carry in %edi is the src
+C high limb, and this may be greater than the divisor and may need one copy
+C of the divisor subtracted (only one, because the divisor is normalized).
+C This is accomplished by having the initial ecx:edi act as a fake previous
+C n2:n10. The initial edx:eax is d, acting as a fake (q1+1)*d which is
+C subtracted from ecx:edi, with the usual addback if it produces an
+C underflow.
+
+
+L(inverse_top):
+ C eax scratch (n10, n1, q1, etc)
+ C ebx scratch (nadj, src limit)
+ C ecx old n2
+ C edx scratch
+ C esi src pointer, &src[size-2] to &src[0]
+ C edi old n10
+ C ebp d
+
+ subl %eax, %edi C low n - (q1+1)*d
+ movl (%esi), %eax C new n10
+
+ sbbl %edx, %ecx C high n - (q1+1)*d, 0 or -1
+ movl %ebp, %ebx C d
+
+ sarl $31, %eax C -n1
+ andl %ebp, %ecx C d if underflow
+
+ addl %edi, %ecx C remainder -> n2, and possible addback
+ ASSERT(b,`cmpl %ebp, %ecx')
+ andl %eax, %ebx C -n1 & d
+
+ movl (%esi), %edi C n10
+ andl $1, %eax C n1
+
+ addl %ecx, %eax C n2+n1
+ addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ addl %eax, %ebx C low(m*(n2+n1) + nadj), giving carry flag
+ leal 1(%ecx), %eax C 1+n2
+
+ adcl %edx, %eax C 1 + high[n2<<32 + m*(n2+n1) + nadj] = q1+1
+ movl PARAM_SRC, %ebx
+
+ sbbl $0, %eax C use q1 if q1+1 overflows
+ subl $4, %esi C step src ptr
+
+ mull %ebp C (q1+1)*d
+
+ cmpl %ebx, %esi
+ jae L(inverse_top)
+
+
+
+ C %edi (after subtract and addback) is the remainder modulo d*2^n
+ C and must be reduced to 0<=r<d by calculating r*2^n mod d*2^n and
+ C right shifting by n.
+ C
+ C If d was already normalized on entry so that n==0 then nothing is
+ C needed here. This is always the case for preinv_mod_1. For mod_1
+ C or mod_1c the chance of n==0 is low, but about 40 cycles can be
+ C saved.
+
+ subl %eax, %edi C low n - (q1+1)*d
+ movl %ecx, %ebx C n2
+
+ sbbl %edx, %ebx C high n - (q1+1)*d, 0 or -1
+ xorl %esi, %esi C next n2
+
+ andl %ebp, %ebx C d if underflow
+ movl VAR_NORM, %ecx
+
+ addl %ebx, %edi C remainder, with possible addback
+ orl %ecx, %ecx
+
+ jz L(done_mul_edi) C n==0, edi is already the result
+
+
+ C Here using %esi=n2 and %edi=n10, unlike the above
+
+ shldl( %cl, %edi, %esi) C n2
+
+ shll %cl, %edi C n10
+
+ movl %edi, %eax C n10
+ movl %edi, %ebx C n10
+
+ sarl $31, %ebx C -n1
+
+ shrl $31, %eax C n1
+ andl %ebp, %ebx C -n1 & d
+
+ addl %esi, %eax C n2+n1
+ addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ addl %eax, %ebx C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%esi), %eax C 1+n2
+
+ adcl %edx, %eax C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %eax C use q1 if q1+1 overflows
+
+ mull %ebp C (q1+1)*d
+
+ subl %eax, %edi C low n - (q1+1)*d
+ popl %ebx
+
+ sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1
+ movl %edi, %eax
+
+ andl %ebp, %esi C d if underflow
+ popl %edi
+
+ addl %esi, %eax C addback if underflow
+ popl %esi
+
+ shrl %cl, %eax C denorm remainder
+ popl %ebp
+
+ ret
+
+
+L(done_mul_edi):
+ movl %edi, %eax
+ popl %ebx
+
+ popl %edi
+L(done_eax):
+ popl %esi
+
+ popl %ebp
+
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/pentium/mod_34lsub1.asm b/gmp/mpn/x86/pentium/mod_34lsub1.asm
index 2d88223b84..201081a437 100644
--- a/gmp/mpn/x86/pentium/mod_34lsub1.asm
+++ b/gmp/mpn/x86/pentium/mod_34lsub1.asm
@@ -1,32 +1,21 @@
dnl Intel P5 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mode1o.asm b/gmp/mpn/x86/pentium/mode1o.asm
index eb2790e1a0..222f64e5cb 100644
--- a/gmp/mpn/x86/pentium/mode1o.asm
+++ b/gmp/mpn/x86/pentium/mode1o.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
-dnl Copyright 2000-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -133,7 +122,7 @@ dnl non-PIC
subl %eax, %ecx C inv = 2*inv - inv*inv*d
pushl %esi FRAME_pushl()
- ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
movl %ecx, %eax
imull PARAM_DIVISOR, %eax
cmpl $1, %eax')
diff --git a/gmp/mpn/x86/pentium/mul_1.asm b/gmp/mpn/x86/pentium/mul_1.asm
index a0858af2b4..c6b255c322 100644
--- a/gmp/mpn/x86/pentium/mul_1.asm
+++ b/gmp/mpn/x86/pentium/mul_1.asm
@@ -2,32 +2,21 @@ dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mul_2.asm b/gmp/mpn/x86/pentium/mul_2.asm
index 4c7beb5df2..36a025c425 100644
--- a/gmp/mpn/x86/pentium/mul_2.asm
+++ b/gmp/mpn/x86/pentium/mul_2.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/mul_basecase.asm b/gmp/mpn/x86/pentium/mul_basecase.asm
index 50e15d3567..fd24fdf7fa 100644
--- a/gmp/mpn/x86/pentium/mul_basecase.asm
+++ b/gmp/mpn/x86/pentium/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
-dnl Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1996, 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/popcount.asm b/gmp/mpn/x86/pentium/popcount.asm
index b8d84ad2e2..df53bb8842 100644
--- a/gmp/mpn/x86/pentium/popcount.asm
+++ b/gmp/mpn/x86/pentium/popcount.asm
@@ -1,32 +1,21 @@
dnl Intel P5 mpn_popcount -- mpn bit population count.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/rshift.asm b/gmp/mpn/x86/pentium/rshift.asm
index 2105c4c935..949b0d2e2f 100644
--- a/gmp/mpn/x86/pentium/rshift.asm
+++ b/gmp/mpn/x86/pentium/rshift.asm
@@ -1,32 +1,22 @@
dnl Intel Pentium mpn_rshift -- mpn right shift.
-dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium/sqr_basecase.asm b/gmp/mpn/x86/pentium/sqr_basecase.asm
index b11d767da2..e4fca7c546 100644
--- a/gmp/mpn/x86/pentium/sqr_basecase.asm
+++ b/gmp/mpn/x86/pentium/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
-dnl Copyright 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/README b/gmp/mpn/x86/pentium4/README
index 90f752e5d5..8dc0479f04 100644
--- a/gmp/mpn/x86/pentium4/README
+++ b/gmp/mpn/x86/pentium4/README
@@ -3,28 +3,17 @@ Copyright 2001 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86/pentium4/copyd.asm b/gmp/mpn/x86/pentium4/copyd.asm
index 82af81c522..491ad60128 100644
--- a/gmp/mpn/x86/pentium4/copyd.asm
+++ b/gmp/mpn/x86/pentium4/copyd.asm
@@ -1,32 +1,22 @@
dnl Pentium-4 mpn_copyd -- copy limb vector, decrementing.
-
-dnl Copyright 1999-2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+
+dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl The std/rep/movsl/cld is very slow for small blocks on pentium4. Its
diff --git a/gmp/mpn/x86/pentium4/copyi.asm b/gmp/mpn/x86/pentium4/copyi.asm
index b6148879fa..bf812c822b 100644
--- a/gmp/mpn/x86/pentium4/copyi.asm
+++ b/gmp/mpn/x86/pentium4/copyi.asm
@@ -1,32 +1,22 @@
dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing.
-
-dnl Copyright 1999-2001 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+
+dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl The rep/movsl is very slow for small blocks on pentium4. Its startup
diff --git a/gmp/mpn/x86/pentium4/mmx/lshift.asm b/gmp/mpn/x86/pentium4/mmx/lshift.asm
index b5eca66698..5d316d5da4 100644
--- a/gmp/mpn/x86/pentium4/mmx/lshift.asm
+++ b/gmp/mpn/x86/pentium4/mmx/lshift.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_lshift -- left shift.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/mmx/popham.asm b/gmp/mpn/x86/pentium4/mmx/popham.asm
index 9563cb57e4..2e79816821 100644
--- a/gmp/mpn/x86/pentium4/mmx/popham.asm
+++ b/gmp/mpn/x86/pentium4/mmx/popham.asm
@@ -1,33 +1,22 @@
dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
dnl hamming distance.
-dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/mmx/rshift.asm b/gmp/mpn/x86/pentium4/mmx/rshift.asm
index 3ac0094a5a..a7dec54a3a 100644
--- a/gmp/mpn/x86/pentium4/mmx/rshift.asm
+++ b/gmp/mpn/x86/pentium4/mmx/rshift.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_rshift -- right shift.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/add_n.asm b/gmp/mpn/x86/pentium4/sse2/add_n.asm
index 8e2380e493..04c0c68d0e 100644
--- a/gmp/mpn/x86/pentium4/sse2/add_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/add_n.asm
@@ -1,44 +1,36 @@
dnl Intel Pentium-4 mpn_add_n -- mpn addition.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C dst!=src1,2 dst==src1 dst==src2
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4 6 6
-C P4 model 3-4 (Prescott) 4.25 7.5 7.5
+C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
+C 6.0 cycles/limb if dst==src1 or dst==src2
+C P4 Prescott: >= 5 cycles/limb
+
+C mp_limb_t mpn_add_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_add_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C The 4 c/l achieved here isn't particularly good, but is better than 9 c/l
+C for a basic adc loop.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
@@ -54,25 +46,29 @@ define(SAVE_EBX,`PARAM_SRC1')
PROLOGUE(mpn_add_nc)  C carry-in entry point: seeds mm0 from the carry argument, then shares the body of mpn_add_n
deflit(`FRAME',0)
+
movd PARAM_CARRY, %mm0  C mm0 = caller-supplied carry (PARAM_CARRY is the frame slot at offset 20)
jmp L(start_nc)  C L(start_nc) sits just after the pxor in mpn_add_n, so that zeroing of mm0 is skipped
+
EPILOGUE()
ALIGN(8)
PROLOGUE(mpn_add_n)
deflit(`FRAME',0)
+
pxor %mm0, %mm0
+
L(start_nc):
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
+ movl PARAM_SRC1, %eax
+ movl %ebx, SAVE_EBX
+ movl PARAM_SRC2, %ebx
+ movl PARAM_DST, %edx
+ movl PARAM_SIZE, %ecx
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
+ leal (%eax,%ecx,4), %eax C src1 end
+ leal (%ebx,%ecx,4), %ebx C src2 end
+ leal (%edx,%ecx,4), %edx C dst end
+ negl %ecx C -size
L(top):
C eax src1 end
@@ -90,11 +86,12 @@ L(top):
psrlq $32, %mm0
- add $1, %ecx
+ addl $1, %ecx
jnz L(top)
+
movd %mm0, %eax
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
emms
ret
diff --git a/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm b/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
index 93b63b2018..46b0903c50 100644
--- a/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
@@ -1,45 +1,33 @@
dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
-dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
+dnl Copyright 2001, 2002, 2003, 2004, 2006 Free Software Foundation, Inc.
dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C dst!=src1,2 dst==src1 dst==src2
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4.25 6 6
-C P4 model 3-4 (Prescott) 5 8.5 8.5
+C cycles/limb (approx)
+C dst!=src1,2 dst==src1 dst==src2
+C P4 m2: 4.5 ?7.25 ?6.75
+C P4 m3: 5.3 ? ?
+C mp_limb_t mpn_addlsh1_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
C The slightly strange combination of indexing and pointer incrementing
C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or
C src2 is a slowdown.
@@ -63,18 +51,18 @@ define(SAVE_EBX,`PARAM_SRC1')
PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
+ movl PARAM_SRC1, %eax
+ movl %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
+ movl PARAM_SRC2, %ebx
pxor %mm0, %mm0 C initial carry
- mov PARAM_DST, %edx
+ movl PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
+ movl PARAM_SIZE, %ecx
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
+ leal (%edx,%ecx,4), %edx C dst end
+ negl %ecx C -size
L(top):
C eax src1 end
@@ -83,24 +71,24 @@ L(top):
C edx dst end
C mm0 carry
- movd (%ebx), %mm2
movd (%eax), %mm1
+ movd (%ebx), %mm2
psrlq $32, %mm0
- lea 4(%eax), %eax
- lea 4(%ebx), %ebx
+ leal 4(%eax), %eax
+ leal 4(%ebx), %ebx
- psllq $1, %mm2
+ paddq %mm2, %mm1
paddq %mm2, %mm1
paddq %mm1, %mm0
movd %mm0, (%edx,%ecx,4)
- add $1, %ecx
+ addl $1, %ecx
jnz L(top)
psrlq $32, %mm0
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
movd %mm0, %eax
emms
ret
diff --git a/gmp/mpn/x86/pentium4/sse2/addmul_1.asm b/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
index 78102072bf..3a8d0bb9bd 100644
--- a/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
@@ -1,48 +1,37 @@
dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
-
+dnl Copyright 2005, 2007 Free Software Foundation, Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 5.24
-C P6 model 13 (Dothan) 5.24
-C P4 model 0-1 (Willamette) 5
-C P4 model 2 (Northwood) 5
-C P4 model 3-4 (Prescott) 5
-
C TODO:
C * Tweak eax/edx offsets in loop as to save some lea's
C * Perhaps software pipeline small-case code
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 5.24
+C P4 model 0-1 (Willamette): 5
+C P4 model 2 (Northwood): 5
+C P4 model 3-4 (Prescott): 5
+
C INPUT PARAMETERS
C rp sp + 4
C up sp + 8
@@ -51,13 +40,22 @@ C v0 sp + 16
TEXT
ALIGN(16)
+PROLOGUE(mpn_addmul_1c)  C entry point taking one extra stack argument (sp + 20); joins mpn_addmul_1 at L(ent)
+ mov 4(%esp), %edx  C edx = rp (sp + 4)
+ mov 8(%esp), %eax  C eax = up (sp + 8)
+ mov 12(%esp), %ecx  C ecx = argument at sp + 12, tested against 4 at L(ent); presumably the limb count n - confirm
+ movd 16(%esp), %mm7  C mm7 = v0 (sp + 16)
+ movd 20(%esp), %mm6  C mm6 = argument at sp + 20; mpn_addmul_1 zeroes mm6 instead, so presumably the initial carry limb - confirm
+ jmp L(ent)  C L(ent) follows the argument loads and pxor in mpn_addmul_1, so all five registers must already be set here
+EPILOGUE()
+ ALIGN(16)
PROLOGUE(mpn_addmul_1)
- pxor %mm6, %mm6
-L(ent): mov 4(%esp), %edx
+ mov 4(%esp), %edx
mov 8(%esp), %eax
mov 12(%esp), %ecx
movd 16(%esp), %mm7
- cmp $4, %ecx
+ pxor %mm6, %mm6
+L(ent): cmp $4, %ecx
jnc L(big)
L(lp0): movd (%eax), %mm0
@@ -183,7 +181,3 @@ L(end): pmuludq %mm7, %mm2
emms
ret
EPILOGUE()
-PROLOGUE(mpn_addmul_1c)
- movd 20(%esp), %mm6
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
deleted file mode 100644
index 354300e4de..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
+++ /dev/null
@@ -1,141 +0,0 @@
-dnl Intel Atom mpn_bdiv_dbm1.
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C cycles/limb
-C P5 -
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 9.75
-C P6 model 13 (Dothan)
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood) 8.25
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 8
-C AMD K6 -
-C AMD K7 -
-C AMD K8
-C AMD K10
-
-C TODO: This code was optimised for atom-32, consider moving it back to atom
-C dir(atom currently grabs this code), and write a 4-way version(7c/l).
-
-defframe(PARAM_CARRY,20)
-defframe(PARAM_MUL, 16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl re-use parameter space
-define(SAVE_RP,`PARAM_MUL')
-define(SAVE_UP,`PARAM_SIZE')
-
-define(`rp', `%edi')
-define(`up', `%esi')
-define(`n', `%ecx')
-define(`reg', `%edx')
-define(`cy', `%eax') C contains the return value
-
-ASM_START()
- TEXT
- ALIGN(16)
-deflit(`FRAME',0)
-
-PROLOGUE(mpn_bdiv_dbm1c)
- mov PARAM_SIZE, n C size
- mov up, SAVE_UP
- mov PARAM_SRC, up
- movd PARAM_MUL, %mm7
- mov rp, SAVE_RP
- mov PARAM_DST, rp
-
- movd (up), %mm0
- pmuludq %mm7, %mm0
- shr n
- mov PARAM_CARRY, cy
- jz L(eq1)
-
- movd 4(up), %mm1
- jc L(odd)
-
- lea 4(up), up
- pmuludq %mm7, %mm1
- movd %mm0, reg
- psrlq $32, %mm0
- sub reg, cy
- movd %mm0, reg
- movq %mm1, %mm0
- dec n
- mov cy, (rp)
- lea 4(rp), rp
- jz L(end)
-
-C ALIGN(16)
-L(top): movd 4(up), %mm1
- sbb reg, cy
-L(odd): movd %mm0, reg
- psrlq $32, %mm0
- pmuludq %mm7, %mm1
- sub reg, cy
- lea 8(up), up
- movd %mm0, reg
- movd (up), %mm0
- mov cy, (rp)
- sbb reg, cy
- movd %mm1, reg
- psrlq $32, %mm1
- sub reg, cy
- movd %mm1, reg
- pmuludq %mm7, %mm0
- dec n
- mov cy, 4(rp)
- lea 8(rp), rp
- jnz L(top)
-
-L(end): sbb reg, cy
-
-L(eq1): movd %mm0, reg
- psrlq $32, %mm0
- mov SAVE_UP, up
- sub reg, cy
- movd %mm0, reg
- emms
- mov cy, (rp)
- sbb reg, cy
-
- mov SAVE_RP, rp
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
deleted file mode 100644
index f7f461d56f..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
+++ /dev/null
@@ -1,233 +0,0 @@
-dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
-
-dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
-
-dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C P4: 19.0 cycles/limb
-
-C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
-C being on the dependent chain and there being plenty of cycles available,
-C using an unaligned movq on every second iteration measured about 23 c/l.
-C
-
-defframe(PARAM_SHIFT, 24)
-defframe(PARAM_INVERSE,20)
-defframe(PARAM_DIVISOR,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- TEXT
-
-C mp_limb_t
-C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t inverse, int shift)
- ALIGN(32)
-PROLOGUE(mpn_pi1_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %edx
-
- movl PARAM_SRC, %eax
-
- movl PARAM_DIVISOR, %ecx
-
- movd %ecx, %mm6
- movl PARAM_SHIFT, %ecx
-
- movd %ecx, %mm7 C shift
-
- C
-
- movl PARAM_INVERSE, %ecx
- movd %ecx, %mm5 C inv
-
- movl PARAM_DST, %ecx
- pxor %mm1, %mm1 C initial carry limb
- pxor %mm0, %mm0 C initial carry bit
-
- subl $1, %edx
- jz L(done)
-
- pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
-
-C The dependent chain here is as follows.
-C
-C latency
-C psubq s = (src-cbit) - climb 2
-C pmuludq q = s*inverse 8
-C pmuludq prod = q*divisor 8
-C psrlq climb = high(prod) 2
-C --
-C 20
-C
-C Yet the loop measures 19.0 c/l, so obviously there's something gained
-C there over a straight reading of the chip documentation.
-
-L(top):
- C eax src, incrementing
- C ebx
- C ecx dst, incrementing
- C edx counter, size-1 iterations
- C
- C mm0 carry bit
- C mm1 carry limb
- C mm4 0x00000000FFFFFFFF
- C mm5 inverse
- C mm6 divisor
- C mm7 shift
-
- movd (%eax), %mm2
- movd 4(%eax), %mm3
- addl $4, %eax
- punpckldq %mm3, %mm2
-
- psrlq %mm7, %mm2
- pand %mm4, %mm2 C src
- psubq %mm0, %mm2 C src - cbit
-
- psubq %mm1, %mm2 C src - cbit - climb
- movq %mm2, %mm0
- psrlq $63, %mm0 C new cbit
-
- pmuludq %mm5, %mm2 C s*inverse
- movd %mm2, (%ecx) C q
- addl $4, %ecx
-
- movq %mm6, %mm1
- pmuludq %mm2, %mm1 C q*divisor
- psrlq $32, %mm1 C new climb
-
-L(entry):
- subl $1, %edx
- jnz L(top)
-
-L(done):
- movd (%eax), %mm2
- psrlq %mm7, %mm2 C src
- psubq %mm0, %mm2 C src - cbit
-
- psubq %mm1, %mm2 C src - cbit - climb
-
- pmuludq %mm5, %mm2 C s*inverse
- movd %mm2, (%ecx) C q
-
- emms
- ret
-
-EPILOGUE()
-
- ALIGN(16)
-C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C
-PROLOGUE(mpn_bdiv_q_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %edx
-
- movl PARAM_DIVISOR, %ecx
-
- C eax src
- C ebx
- C ecx divisor
- C edx size-1
-
- movl %ecx, %eax
- bsfl %ecx, %ecx C trailing twos
-
- shrl %cl, %eax C d = divisor without twos
- movd %eax, %mm6
- movd %ecx, %mm7 C shift
-
- shrl %eax C d/2
-
- andl $127, %eax C d/2, 7 bits
-
-ifdef(`PIC',`
- LEA( binvert_limb_table, %ecx)
- movzbl (%eax,%ecx), %eax C inv 8 bits
-',`
- movzbl binvert_limb_table(%eax), %eax C inv 8 bits
-')
-
- C
-
- movd %eax, %mm5 C inv
-
- movd %eax, %mm0 C inv
-
- pmuludq %mm5, %mm5 C inv*inv
-
- C
-
- pmuludq %mm6, %mm5 C inv*inv*d
- paddd %mm0, %mm0 C 2*inv
-
- C
-
- psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
- pxor %mm5, %mm5
-
- paddd %mm0, %mm5
- pmuludq %mm0, %mm0 C inv*inv
-
- pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
-
- C
-
- pmuludq %mm6, %mm0 C inv*inv*d
- paddd %mm5, %mm5 C 2*inv
-
- movl PARAM_SRC, %eax
- movl PARAM_DST, %ecx
- pxor %mm1, %mm1 C initial carry limb
-
- C
-
- psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
-
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
- pushl %eax FRAME_pushl()
- movq %mm6, %mm0
- pmuludq %mm5, %mm0
- movd %mm0, %eax
- cmpl $1, %eax
- popl %eax FRAME_popl()')
-
- pxor %mm0, %mm0 C initial carry bit
- jmp L(entry)
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm b/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
deleted file mode 100644
index b3f3474e67..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
+++ /dev/null
@@ -1,95 +0,0 @@
-dnl Intel Pentium-4 mpn_cnd_add_n -- mpn addition.
-
-dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 4.67
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 5
-C P4 model 3-4 (Prescott) 5.25
-
-defframe(PARAM_SIZE, 20)
-defframe(PARAM_SRC2, 16)
-defframe(PARAM_SRC1, 12)
-defframe(PARAM_DST, 8)
-defframe(PARAM_CND, 4)
-
-dnl re-use parameter space
-define(SAVE_EBX,`PARAM_SRC1')
-
-define(`cnd', `%mm3')
-
- TEXT
- ALIGN(8)
-
- ALIGN(8)
-PROLOGUE(mpn_cnd_add_n)
-deflit(`FRAME',0)
- pxor %mm0, %mm0
-
- mov PARAM_CND, %eax
- neg %eax
- sbb %eax, %eax
- movd %eax, cnd
-
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
-
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
-
-L(top): movd (%ebx,%ecx,4), %mm2
- movd (%eax,%ecx,4), %mm1
- pand cnd, %mm2
- paddq %mm2, %mm1
-
- paddq %mm1, %mm0
- movd %mm0, (%edx,%ecx,4)
-
- psrlq $32, %mm0
-
- add $1, %ecx
- jnz L(top)
-
- movd %mm0, %eax
- mov SAVE_EBX, %ebx
- emms
- ret
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
deleted file mode 100644
index 339a23e0b6..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-dnl Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction.
-
-dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 4.67
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 5
-C P4 model 3-4 (Prescott) 5.25
-
-defframe(PARAM_SIZE, 20)
-defframe(PARAM_SRC2, 16)
-defframe(PARAM_SRC1, 12)
-defframe(PARAM_DST, 8)
-defframe(PARAM_CND, 4)
-
-dnl re-use parameter space
-define(SAVE_EBX,`PARAM_SRC1')
-
-define(`cnd', `%mm3')
-
- TEXT
- ALIGN(8)
-
- ALIGN(8)
-PROLOGUE(mpn_cnd_sub_n)
-deflit(`FRAME',0)
- pxor %mm0, %mm0
-
- mov PARAM_CND, %eax
- neg %eax
- sbb %eax, %eax
- movd %eax, cnd
-
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
-
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
-
-L(top): movd (%ebx,%ecx,4), %mm2
- movd (%eax,%ecx,4), %mm1
- pand cnd, %mm2
- psubq %mm2, %mm1
-
- psubq %mm0, %mm1
- movd %mm1, (%edx,%ecx,4)
-
- psrlq $63, %mm1
-
- add $1, %ecx
- jz L(done_mm1)
-
- movd (%ebx,%ecx,4), %mm2
- movd (%eax,%ecx,4), %mm0
- pand cnd, %mm2
- psubq %mm2, %mm0
-
- psubq %mm1, %mm0
- movd %mm0, (%edx,%ecx,4)
-
- psrlq $63, %mm0
-
- add $1, %ecx
- jnz L(top)
-
- movd %mm0, %eax
- mov SAVE_EBX, %ebx
- emms
- ret
-
-L(done_mm1):
- movd %mm1, %eax
- mov SAVE_EBX, %ebx
- emms
- ret
-
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/dive_1.asm b/gmp/mpn/x86/pentium4/sse2/dive_1.asm
index 238f0dd8a5..c50ef7d29e 100644
--- a/gmp/mpn/x86/pentium4/sse2/dive_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/dive_1.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -137,7 +126,7 @@ ifdef(`PIC',`
psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
movq %mm6, %mm0
pmuludq %mm5, %mm0
@@ -150,13 +139,13 @@ ifdef(`PIC',`
C The dependent chain here is as follows.
C
-C latency
-C psubq s = (src-cbit) - climb 2
-C pmuludq q = s*inverse 8
-C pmuludq prod = q*divisor 8
-C psrlq climb = high(prod) 2
-C --
-C 20
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
C
C Yet the loop measures 19.0 c/l, so obviously there's something gained
C there over a straight reading of the chip documentation.
diff --git a/gmp/mpn/x86/pentium4/sse2/divrem_1.asm b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
index 0146fab117..7f973dbf98 100644
--- a/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
@@ -1,32 +1,22 @@
dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
-dnl Copyright 1999-2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h b/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
index a94ae868b3..5071aae092 100644
--- a/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
+++ b/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -1,206 +1,68 @@
/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2005, 2007-2010, 2014 Free Software
-Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008,
+2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
-or both in parallel, as here.
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 32
-#define GMP_LIMB_BYTES 4
-
-/* 2600 MHz P4 Northwood */
-/* FFT tuning limit = 12500000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */
-
-#define MOD_1_NORM_THRESHOLD 24
-#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 2
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1N_PI1_METHOD 2
-#define DIV_QR_1_NORM_THRESHOLD 19
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 20
-
-#define MUL_TOOM22_THRESHOLD 29
-#define MUL_TOOM33_THRESHOLD 113
-#define MUL_TOOM44_THRESHOLD 288
-#define MUL_TOOM6H_THRESHOLD 454
-#define MUL_TOOM8H_THRESHOLD 592
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 118
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 214
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 193
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 186
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 287
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 44
-#define SQR_TOOM3_THRESHOLD 173
-#define SQR_TOOM4_THRESHOLD 390
-#define SQR_TOOM6_THRESHOLD 0 /* always */
-#define SQR_TOOM8_THRESHOLD 915
-
-#define MULMID_TOOM42_THRESHOLD 66
-
-#define MULMOD_BNM1_THRESHOLD 19
-#define SQRMOD_BNM1_THRESHOLD 23
-
-#define MUL_FFT_MODF_THRESHOLD 1147 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 1147, 5}, { 36, 6}, { 19, 5}, { 39, 6}, \
- { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 35, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \
- { 43, 7}, { 23, 6}, { 49, 7}, { 27, 6}, \
- { 55, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \
- { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
- { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
- { 39, 7}, { 79, 8}, { 43, 9}, { 23, 8}, \
- { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
- { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
- { 47, 9}, { 111,11}, { 31,10}, { 63, 9}, \
- { 143,10}, { 79, 9}, { 159,10}, { 111,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 207,12}, { 63,11}, \
- { 127,10}, { 287,11}, { 159,10}, { 335,11}, \
- { 191,10}, { 383,11}, { 223,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 319,10}, { 671,11}, \
- { 351,12}, { 191,11}, { 383,10}, { 799,13}, \
- { 127,12}, { 255,11}, { 511,10}, { 1055, 9}, \
- { 2111,10}, { 1119, 9}, { 2239,11}, { 607,12}, \
- { 319,11}, { 671,10}, { 1407,11}, { 735,10}, \
- { 1471, 9}, { 2943,12}, { 383,11}, { 799,10}, \
- { 1599,11}, { 863,10}, { 1727, 9}, { 3455,12}, \
- { 447,11}, { 895,13}, { 255,12}, { 511,11}, \
- { 1055,10}, { 2111,11}, { 1119,10}, { 2239, 9}, \
- { 4479,12}, { 575,11}, { 1247,10}, { 2495, 9}, \
- { 4991,12}, { 639,11}, { 1471,10}, { 2943,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2047,12}, { 1087,11}, \
- { 2239,10}, { 4479,12}, { 1215,11}, { 2495,10}, \
- { 4991,13}, { 639,12}, { 1471,11}, { 2943,10}, \
- { 5887,11}, { 3007,13}, { 767,12}, { 1727,11}, \
- { 3455,13}, { 895,12}, { 1791,11}, { 3711,12}, \
- { 1983,11}, { 3967,10}, { 7935,14}, { 511,13}, \
- { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \
- { 2495,11}, { 4991,13}, { 1279,12}, { 2623,13}, \
- { 1407,12}, { 2943,11}, { 5887,12}, { 3007,14}, \
- { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 158
-#define MUL_FFT_THRESHOLD 7808
-
-#define SQR_FFT_MODF_THRESHOLD 896 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 896, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \
- { 17, 5}, { 35, 6}, { 19, 5}, { 39, 6}, \
- { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
- { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
- { 47, 7}, { 27, 6}, { 55, 7}, { 31, 6}, \
- { 63, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \
- { 23, 7}, { 51, 8}, { 27, 7}, { 55, 8}, \
- { 31, 7}, { 63, 8}, { 39, 7}, { 79, 8}, \
- { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
- { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
- { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
- { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
- { 127,10}, { 79, 9}, { 159,10}, { 95, 9}, \
- { 191,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,11}, { 159,10}, { 319, 9}, { 639,11}, \
- { 191,10}, { 383, 9}, { 767,11}, { 223,12}, \
- { 127,11}, { 255,10}, { 511, 9}, { 1055,10}, \
- { 543,11}, { 287,10}, { 607,11}, { 319,12}, \
- { 191,11}, { 383,10}, { 767,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1055,11}, { 543,10}, \
- { 1119, 9}, { 2239,11}, { 607,12}, { 319,11}, \
- { 671,10}, { 1407,11}, { 735,10}, { 1471, 9}, \
- { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \
- { 863,10}, { 1727,12}, { 447,11}, { 991,13}, \
- { 255,12}, { 511,11}, { 1055,10}, { 2111,11}, \
- { 1119,10}, { 2239,12}, { 575,11}, { 1247,10}, \
- { 2495,12}, { 639,11}, { 1471,10}, { 2943,13}, \
- { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
- { 1727,10}, { 3455,12}, { 959,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2111,12}, { 1087,11}, \
- { 2239,10}, { 4479,12}, { 1215,11}, { 2495,13}, \
- { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \
- { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
- { 1791,11}, { 3711,12}, { 1983,11}, { 3967,10}, \
- { 7935,14}, { 511,13}, { 1023,12}, { 2239,11}, \
- { 4479,13}, { 1151,12}, { 2495,11}, { 4991,13}, \
- { 1279,12}, { 2623,13}, { 1407,12}, { 2943,11}, \
- { 5887,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 159
-#define SQR_FFT_THRESHOLD 7296
-
-#define MULLO_BASECASE_THRESHOLD 12
-#define MULLO_DC_THRESHOLD 55
-#define MULLO_MUL_N_THRESHOLD 14709
-
-#define DC_DIV_QR_THRESHOLD 38
-#define DC_DIVAPPR_Q_THRESHOLD 77
-#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 85
-
-#define INV_MULMOD_BNM1_THRESHOLD 56
-#define INV_NEWTON_THRESHOLD 121
-#define INV_APPR_THRESHOLD 93
-
-#define BINV_NEWTON_THRESHOLD 366
-#define REDC_1_TO_REDC_N_THRESHOLD 64
-
-#define MU_DIV_QR_THRESHOLD 2350
-#define MU_DIVAPPR_Q_THRESHOLD 2172
-#define MUPI_DIV_QR_THRESHOLD 62
-#define MU_BDIV_QR_THRESHOLD 2172
-#define MU_BDIV_Q_THRESHOLD 2304
-
-#define POWM_SEC_TABLE 1,19,102,615,2111
-
-#define MATRIX22_STRASSEN_THRESHOLD 23
-#define HGCD_THRESHOLD 88
-#define HGCD_APPR_THRESHOLD 93
-#define HGCD_REDUCE_THRESHOLD 5010
-#define GCD_DC_THRESHOLD 379
-#define GCDEXT_DC_THRESHOLD 258
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 147
-#define SET_STR_PRECOMPUTE_THRESHOLD 894
-
-#define FAC_DSC_THRESHOLD 906
-#define FAC_ODD_THRESHOLD 28
+/* 2600 MHz Pentium 4 model 2 */
+
+/* Generated by tuneup.c, 2009-01-06, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 31
+#define MUL_TOOM3_THRESHOLD 119
+#define MUL_TOOM44_THRESHOLD 178
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 49
+#define SQR_TOOM3_THRESHOLD 165
+#define SQR_TOOM4_THRESHOLD 252
+
+#define MULLOW_BASECASE_THRESHOLD 15
+#define MULLOW_DC_THRESHOLD 44
+#define MULLOW_MUL_N_THRESHOLD 363
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_DC_THRESHOLD 33
+#define POWM_THRESHOLD 95
+
+#define MATRIX22_STRASSEN_THRESHOLD 23
+#define HGCD_THRESHOLD 64
+#define GCD_DC_THRESHOLD 310
+#define GCDEXT_DC_THRESHOLD 310
+#define JACOBI_BASE_METHOD 1
+
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 118
+#define SET_STR_PRECOMPUTE_THRESHOLD 1078
+
+#define MUL_FFT_TABLE { 560, 928, 1920, 5632, 14336, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 720
+#define MUL_FFT_THRESHOLD 9216
+
+#define SQR_FFT_TABLE { 592, 928, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 608
+#define SQR_FFT_THRESHOLD 5888
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1.asm b/gmp/mpn/x86/pentium4/sse2/mod_1.asm
new file mode 100644
index 0000000000..0e95f13913
--- /dev/null
+++ b/gmp/mpn/x86/pentium4/sse2/mod_1.asm
@@ -0,0 +1,391 @@
+dnl Intel Pentium-4 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl P4: 31 cycles/limb.
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse);
+C
+C An idea was tried in the mul-by-inverse to process the last limb by a jump
+C back to the top of the loop skipping the -4(%esi) fetch. But that seemed
+C to produce slightly strange timings, like 9 and 10 limb operations about
+C the same speed. The jump would be successively taken and not-taken, which
+C in theory should predict ok, but perhaps isn't enjoyed by the chip.
+C Duplicating the loop for the last limb seems to be a couple of cycles
+C quicker too.
+C
+C Enhancements:
+C
+C The loop measures 31 cycles, but the dependent chain would suggest it
+C could be done with 30. Not sure where to start looking for the extra
+C cycle.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 80-90 cycles to calculate, but after that the
+dnl multiply is 31 c/l versus division at about 58 c/l.
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+ C Entry taking a caller-supplied inverse: set l = 0, reduce the high limb, join the shared inverse code.
+ movl PARAM_SIZE, %ecx C read size before its slot is reused as SAVE_ESI
+ movl %esi, SAVE_ESI
+ movl $32, %eax
+
+ movd %eax, %mm6 C l = 0, so 32-l = 32
+ movl PARAM_SRC, %esi C read src before its slot is reused as SAVE_EBP
+ movl %ebp, SAVE_EBP
+
+ movd PARAM_DIVISOR, %mm5 C d, used unshifted - NOTE(review): presumably callers pass a normalized divisor, confirm
+ pxor %mm7, %mm7 C l = 0
+
+ movd -4(%esi,%ecx,4), %mm0 C src high limb
+ leal -8(%esi,%ecx,4), %esi C &src[size-2]
+
+ movd PARAM_INVERSE, %mm4 C m
+ subl $2, %ecx C size-2
+
+ psubq %mm5, %mm0 C high-divisor
+ movq %mm0, %mm2 C keep high-divisor for the addback
+
+ psrlq $32, %mm0 C -1 if underflow
+
+ pand %mm5, %mm0 C divisor if underflow
+ C jz and ja below test the flags set by subl above
+ paddq %mm2, %mm0 C addback if underflow
+ jz L(inverse_last) C if size==2
+ ja L(inverse_top) C if size>2
+
+
+ C if size==1
+ movl SAVE_ESI, %esi C ebp is not written on this path so only esi is restored
+ movd %mm0, %eax C return the reduced high limb
+ emms
+ ret
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_SIZE, %ecx C entry with an initial carry; read size before its slot is reused as SAVE_ESI
+ movl %esi, SAVE_ESI
+
+ movl PARAM_SRC, %esi C read src before its slot is reused as SAVE_EBP
+ movl %ebp, SAVE_EBP
+
+ movl PARAM_CARRY, %edx C carry goes in edx, where start_1c expects it
+ orl %ecx, %ecx
+ jz L(divide_done) C result==carry if size==0
+
+ movl PARAM_DIVISOR, %ebp
+ jmp L(start_1c) C continue in the code shared with mpn_mod_1
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+ C Plain entry: carry starts at 0, or at the high limb when that limb is below the divisor.
+ movl PARAM_SIZE, %ecx C read size before its slot is reused as SAVE_ESI
+ movl %esi, SAVE_ESI
+
+ movl PARAM_SRC, %esi C read src before its slot is reused as SAVE_EBP
+ movl %ebp, SAVE_EBP
+
+ movl PARAM_DIVISOR, %ebp
+ xorl %edx, %edx C result 0 if size==0
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ leal -1(%ecx), %edx C size-1
+ cmpl %ebp, %eax C c if high<divisor
+
+ cmovc( %edx, %ecx) C size-1 if high<divisor
+ C movl leaves the flags from cmpl alone so the second cmovc tests the same carry
+ movl $0, %edx C initial carry
+ cmovc( %eax, %edx) C src high limb if high<divisor
+
+ orl %ecx, %ecx
+ jz L(divide_done) C if size==1 and skip div
+
+
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+ C Reached from mpn_mod_1 by fall through and from mpn_mod_1c by jump: pick divl loop or inverse method.
+ leal -4(%esi,%ecx,4), %esi C &src[size-1]
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse) C size >= MUL_THRESHOLD
+
+
+L(divide_top):
+ C eax
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx remainder
+ C esi src, decrementing
+ C edi
+ C ebp divisor
+
+ movl (%esi), %eax C next limb, working from high to low
+ subl $4, %esi
+
+ divl %ebp C divide edx:eax by divisor, remainder stays in edx
+
+ subl $1, %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax C return the remainder
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+ C Setup: normalize d, compute the inverse m with a single divl, and form the first n2.
+ bsrl %ebp, %eax C 31-l
+
+ movd %edx, %mm1 C carry
+ movl %ecx, %edx C size
+ movl $31, %ecx
+
+ C
+
+ xorl %eax, %ecx C l = leading zeros on d
+ addl $1, %eax C 32-l
+
+ shll %cl, %ebp C normalize d
+ movd %ecx, %mm7 C l
+ leal -1(%edx), %ecx C size-1
+
+ movd %eax, %mm6 C 32-l
+ movl $-1, %edx C b-1, where b is 2^32
+ movl $-1, %eax C b-1
+
+ C
+
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1 / d)
+
+ movd %ebp, %mm5 C d
+ movd (%esi), %mm0 C src high limb
+ punpckldq %mm1, %mm0 C mm0 = carry:srchigh
+ psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
+
+ C
+
+ movd %eax, %mm4 C m
+
+
+C The dependent chain here consists of
+C
+C 2 paddd n1+n2
+C 8 pmuludq m*(n1+n2)
+C 2 paddq n2:nadj + m*(n1+n2)
+C 2 psrlq q1
+C 8 pmuludq d*q1
+C 2 psubq (n-d)-q1*d
+C 2 psrlq high mask
+C 2 pand d masked
+C 2 paddd n2+d addback
+C --
+C 30
+C
+C But it seems to run at 31 cycles, so presumably there's something else
+C going on.
+
+
+ ALIGN(16)
+L(inverse_top):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx
+ C esi src, decrementing
+ C edi
+ C ebp
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+ C One limb per pass: new n2 = (n2:n10) - (q1+1)*d with q1 estimated from m, then one conditional addback of d.
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movd -4(%esi), %mm1 C next src limbs
+ movd (%esi), %mm2 C the more significant of the two
+ leal -4(%esi), %esi C step down one limb
+
+ punpckldq %mm2, %mm1 C mm1 = limb at (%esi) : limb at -4(%esi), as loaded above
+ psrlq %mm6, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0 C clear mm0 so the por below acts as a copy
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ pand %mm5, %mm3 C mask & d
+
+ paddd %mm3, %mm0 C addback if necessary
+
+ subl $1, %ecx
+ jnz L(inverse_top)
+
+
+ C Least significant limb.
+ C Same code as the loop, but there's no -4(%esi) limb to fetch.
+
+L(inverse_last):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[0]
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+ C Final limb: same reduction step as the loop, then restore registers and shift the remainder down by l.
+ movd (%esi), %mm1 C src[0]
+ psllq %mm7, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0 C clear mm0 so the por below acts as a copy
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ movl SAVE_EBP, %ebp C restore ebp saved at entry
+ pand %mm5, %mm3 C mask & d
+
+ movl SAVE_ESI, %esi C restore esi saved at entry
+ paddd %mm3, %mm0 C addback if necessary
+
+ psrld %mm7, %mm0 C shift the remainder back down by l
+
+ movd %mm0, %eax C return value
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm b/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
deleted file mode 100644
index ee88babeee..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
+++ /dev/null
@@ -1,166 +0,0 @@
-dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO:
-C * Optimize. The present code was written quite straightforwardly.
-C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
-C * Write a cps function that uses sse2 insns.
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 16
-C P4 model 3-4 (Prescott) 18
-
-C INPUT PARAMETERS
-C ap sp + 4
-C n sp + 8
-C b sp + 12
-C cps sp + 16
-
-define(`B1modb', `%mm1')
-define(`B2modb', `%mm2')
-define(`ap', `%edx')
-define(`n', `%eax')
-
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1_1p)
- push %ebx
- mov 8(%esp), ap
- mov 12(%esp), n
- mov 20(%esp), %ecx
- movd 8(%ecx), B1modb
- movd 12(%ecx), B2modb
-
- lea -4(ap,n,4), ap
-
-C FIXME: See comment in generic/mod_1_1.c.
- movd (ap), %mm7
- movd -4(ap), %mm4
- pmuludq B1modb, %mm7
- paddq %mm4, %mm7
- add $-2, n
- jz L(end)
-
- ALIGN(8)
-L(top): movq %mm7, %mm6
- psrlq $32, %mm7 C rh
- movd -8(ap), %mm0
- add $-4, ap
- pmuludq B2modb, %mm7
- pmuludq B1modb, %mm6
- add $-1, n
- paddq %mm0, %mm7
- paddq %mm6, %mm7
- jnz L(top)
-
-L(end): pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
- pand %mm7, %mm4 C rl
- psrlq $32, %mm7 C rh
- pmuludq B1modb, %mm7 C rh,cl
- paddq %mm4, %mm7 C rh,rl
- movd 4(%ecx), %mm4 C cnt
- psllq %mm4, %mm7 C rh,rl normalized
- movq %mm7, %mm2 C rl in low half
- psrlq $32, %mm7 C rh
- movd (%ecx), %mm1 C bi
- pmuludq %mm7, %mm1 C qh,ql
- paddq %mm2, %mm1 C qh-1,ql
- movd %mm1, %ecx C ql
- psrlq $32, %mm1 C qh-1
- movd 16(%esp), %mm3 C b
- pmuludq %mm1, %mm3 C (qh-1) * b
- psubq %mm3, %mm2 C r in low half (could use psubd)
- movd %mm2, %eax C r
- mov 16(%esp), %ebx
- sub %ebx, %eax C r
- cmp %eax, %ecx
- lea (%eax,%ebx), %edx
- cmovc( %edx, %eax)
- movd %mm4, %ecx C cnt
- cmp %ebx, %eax
- jae L(fix)
- emms
- pop %ebx
- shr %cl, %eax
- ret
-
-L(fix): sub %ebx, %eax
- emms
- pop %ebx
- shr %cl, %eax
- ret
-EPILOGUE()
-
-PROLOGUE(mpn_mod_1_1p_cps)
-C CAUTION: This is the same code as in k7/mod_1_1.asm
- push %ebp
- mov 12(%esp), %ebp
- push %esi
- bsr %ebp, %ecx
- push %ebx
- xor $31, %ecx
- mov 16(%esp), %esi
- sal %cl, %ebp
- mov %ebp, %edx
- not %edx
- mov $-1, %eax
- div %ebp
- mov %eax, (%esi) C store bi
- mov %ecx, 4(%esi) C store cnt
- xor %ebx, %ebx
- sub %ebp, %ebx
- mov $1, %edx
- shld %cl, %eax, %edx
- imul %edx, %ebx
- mul %ebx
- add %ebx, %edx
- not %edx
- imul %ebp, %edx
- add %edx, %ebp
- cmp %edx, %eax
- cmovc( %ebp, %edx)
- shr %cl, %ebx
- mov %ebx, 8(%esi) C store B1modb
- shr %cl, %edx
- mov %edx, 12(%esi) C store B2modb
- pop %ebx
- pop %esi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm b/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
deleted file mode 100644
index eb2edb6297..0000000000
--- a/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
+++ /dev/null
@@ -1,269 +0,0 @@
-dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO:
-C * Optimize. The present code was written quite straightforwardly.
-C * Optimize post-loop reduction code.
-C * Write a cps function that uses sse2 insns.
-
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 3.4
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4
-C P4 model 3-4 (Prescott) 4.5
-
-C INPUT PARAMETERS
-C ap sp + 4
-C n sp + 8
-C b sp + 12
-C cps sp + 16
-
-define(`B1modb', `%mm1')
-define(`B2modb', `%mm2')
-define(`B3modb', `%mm3')
-define(`B4modb', `%mm4')
-define(`B5modb', `%mm5')
-define(`ap', `%edx')
-define(`n', `%eax')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p)
- push %ebx
- mov 8(%esp), ap
- mov 12(%esp), n
- mov 20(%esp), %ecx
-
- movd 8(%ecx), B1modb
- movd 12(%ecx), B2modb
- movd 16(%ecx), B3modb
- movd 20(%ecx), B4modb
- movd 24(%ecx), B5modb
-
- mov n, %ebx
- lea -4(ap,n,4), ap
- and $3, %ebx
- je L(b0)
- cmp $2, %ebx
- jc L(b1)
- je L(b2)
-
-L(b3): movd -4(ap), %mm7
- pmuludq B1modb, %mm7
- movd -8(ap), %mm6
- paddq %mm6, %mm7
- movd (ap), %mm6
- pmuludq B2modb, %mm6
- paddq %mm6, %mm7
- lea -24(ap), ap
- add $-3, n
- jz L(end)
- jmp L(top)
-
-L(b0): movd -8(ap), %mm7
- pmuludq B1modb, %mm7
- movd -12(ap), %mm6
- paddq %mm6, %mm7
- movd -4(ap), %mm6
- pmuludq B2modb, %mm6
- paddq %mm6, %mm7
- movd (ap), %mm6
- pmuludq B3modb, %mm6
- paddq %mm6, %mm7
- lea -28(ap), ap
- add $-4, n
- jz L(end)
- jmp L(top)
-
-L(b1): movd (ap), %mm7
- lea -16(ap), ap
- dec n
- jz L(x)
- jmp L(top)
-
-L(b2): movd -4(ap), %mm7 C rl
- punpckldq (ap), %mm7 C rh
- lea -20(ap), ap
- add $-2, n
- jz L(end)
-
- ALIGN(8)
-L(top): movd 4(ap), %mm0
- pmuludq B1modb, %mm0
- movd 0(ap), %mm6
- paddq %mm6, %mm0
-
- movd 8(ap), %mm6
- pmuludq B2modb, %mm6
- paddq %mm6, %mm0
-
- movd 12(ap), %mm6
- pmuludq B3modb, %mm6
- paddq %mm6, %mm0
-
- movq %mm7, %mm6
- psrlq $32, %mm7 C rh
- pmuludq B5modb, %mm7
- pmuludq B4modb, %mm6
-
- paddq %mm0, %mm7
- paddq %mm6, %mm7
-
- add $-16, ap
- add $-4, n
- jnz L(top)
-
-L(end): pcmpeqd %mm4, %mm4
- psrlq $32, %mm4 C 0x00000000FFFFFFFF
- pand %mm7, %mm4 C rl
- psrlq $32, %mm7 C rh
- pmuludq B1modb, %mm7 C rh,cl
- paddq %mm4, %mm7 C rh,rl
-L(x): movd 4(%ecx), %mm4 C cnt
- psllq %mm4, %mm7 C rh,rl normalized
- movq %mm7, %mm2 C rl in low half
- psrlq $32, %mm7 C rh
- movd (%ecx), %mm1 C bi
- pmuludq %mm7, %mm1 C qh,ql
- paddq %mm2, %mm1 C qh-1,ql
- movd %mm1, %ecx C ql
- psrlq $32, %mm1 C qh-1
- movd 16(%esp), %mm3 C b
- pmuludq %mm1, %mm3 C (qh-1) * b
- psubq %mm3, %mm2 C r in low half (could use psubd)
- movd %mm2, %eax C r
- mov 16(%esp), %ebx
- sub %ebx, %eax C r
- cmp %eax, %ecx
- lea (%eax,%ebx), %edx
- cmovc( %edx, %eax)
- movd %mm4, %ecx C cnt
- cmp %ebx, %eax
- jae L(fix)
- emms
- pop %ebx
- shr %cl, %eax
- ret
-
-L(fix): sub %ebx, %eax
- emms
- pop %ebx
- shr %cl, %eax
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p_cps)
-C CAUTION: This is the same code as in k7/mod_1_4.asm
- push %ebp
- push %edi
- push %esi
- push %ebx
- mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
- mov 24(%esp), %ebx
- bsr %ebx, %ecx
- xor $31, %ecx
- sal %cl, %ebx C b << cnt
- mov %ebx, %edx
- not %edx
- mov $-1, %eax
- div %ebx
- xor %edi, %edi
- sub %ebx, %edi
- mov $1, %esi
- mov %eax, (%ebp) C store bi
- mov %ecx, 4(%ebp) C store cnt
- shld %cl, %eax, %esi
- imul %edi, %esi
- mov %eax, %edi
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 8(%ebp) C store B1modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 12(%ebp) C store B2modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 16(%ebp) C store B3modb
-
- not %edx
- imul %ebx, %edx
- lea (%edx,%ebx), %esi
- cmp %edx, %eax
- cmovnc( %edx, %esi)
- mov %edi, %eax
- mul %esi
-
- add %esi, %edx
- shr %cl, %esi
- mov %esi, 20(%ebp) C store B4modb
-
- not %edx
- imul %ebx, %edx
- add %edx, %ebx
- cmp %edx, %eax
- cmovnc( %edx, %ebx)
-
- shr %cl, %ebx
- mov %ebx, 24(%ebp) C store B5modb
-
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
index 31e25b79bc..1598b41785 100644
--- a/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-dnl Copyright 2000-2003 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/mode1o.asm b/gmp/mpn/x86/pentium4/sse2/mode1o.asm
index 778c478169..2f0b177a00 100644
--- a/gmp/mpn/x86/pentium4/sse2/mode1o.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mode1o.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -112,7 +101,7 @@ ifdef(`PIC',`
psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
- ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
pushl %eax FRAME_pushl()
movd %mm6, %eax
imul PARAM_DIVISOR, %eax
@@ -124,13 +113,13 @@ ifdef(`PIC',`
C The dependent chain here is as follows.
C
-C latency
-C psubq s = (src-cbit) - climb 2
-C pmuludq q = s*inverse 8
-C pmuludq prod = q*divisor 8
-C psrlq climb = high(prod) 2
-C --
-C 20
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
C
C Yet the loop measures 19.0 c/l, so obviously there's something gained
C there over a straight reading of the chip documentation.
diff --git a/gmp/mpn/x86/pentium4/sse2/mul_1.asm b/gmp/mpn/x86/pentium4/sse2/mul_1.asm
index 6347b8bf62..07be951921 100644
--- a/gmp/mpn/x86/pentium4/sse2/mul_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mul_1.asm
@@ -1,48 +1,37 @@
dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
-
+dnl Copyright 2005, 2007 Free Software Foundation, Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 4.17
-C P6 model 13 (Dothan) 4.17
-C P4 model 0-1 (Willamette) 4
-C P4 model 2 (Northwood) 4
-C P4 model 3-4 (Prescott) 4.55
-
C TODO:
C * Tweak eax/edx offsets in loop as to save some lea's
C * Perhaps software pipeline small-case code
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4.17
+C P4 model 0-1 (Willamette): 4
+C P4 model 2 (Northwood): 4
+C P4 model 3-4 (Prescott): 4.55
+
C INPUT PARAMETERS
C rp sp + 4
C up sp + 8
@@ -51,13 +40,22 @@ C v0 sp + 16
TEXT
ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+ mov 4(%esp), %edx C edx = rp (sp + 4)
+ mov 8(%esp), %eax C eax = up (sp + 8)
+ mov 12(%esp), %ecx C ecx = arg at sp + 12, presumably the limb count; compared with 4 at L(ent)
+ movd 16(%esp), %mm7 C mm7 = v0 (sp + 16)
+ movd 20(%esp), %mm6 C mm6 = arg at sp + 20, presumably the carry-in limb; mpn_mul_1 zeroes mm6 instead
+ jmp L(ent) C join mpn_mul_1 after its own argument loads
+EPILOGUE()
+ ALIGN(16)
PROLOGUE(mpn_mul_1)
- pxor %mm6, %mm6
-L(ent): mov 4(%esp), %edx
+ mov 4(%esp), %edx
mov 8(%esp), %eax
mov 12(%esp), %ecx
movd 16(%esp), %mm7
- cmp $4, %ecx
+ pxor %mm6, %mm6
+L(ent): cmp $4, %ecx
jnc L(big)
L(lp0): movd (%eax), %mm0
@@ -158,7 +156,3 @@ L(end): pmuludq %mm7, %mm2
emms
ret
EPILOGUE()
-PROLOGUE(mpn_mul_1c)
- movd 20(%esp), %mm6
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
index 6e3775ae09..2628e5eb72 100644
--- a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
+++ b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
@@ -1,32 +1,21 @@
dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
-
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/popcount.asm b/gmp/mpn/x86/pentium4/sse2/popcount.asm
index b8238b9b66..cb982ade46 100644
--- a/gmp/mpn/x86/pentium4/sse2/popcount.asm
+++ b/gmp/mpn/x86/pentium4/sse2/popcount.asm
@@ -1,66 +1,52 @@
dnl X86-32 and X86-64 mpn_popcount using SSE2.
-dnl Copyright 2006, 2007, 2011 Free Software Foundation, Inc.
-
+dnl Copyright 2006, 2007 Free Software Foundation, Inc.
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C 32-bit popcount hamdist
-C cycles/limb cycles/limb
-C P5 -
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 4
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) 3.9
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C AMD K6 -
-C AMD K7 -
-C AMD K8 ?
-
-C 64-bit popcount hamdist
-C cycles/limb cycles/limb
-C P4 model 4 (Nocona): 8
-C AMD K8,K9 7.5
-C AMD K10 3.5
-C Intel core2 3.68
-C Intel corei 3.15
-C Intel atom 10.8
-C VIA nano 6.5
+C 32-bit popcount hamdist
+C cycles/limb cycles/limb
+C P5: -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 3.9
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C K6: -
+C K7: -
+C K8: ?
+
+C 64-bit popcount hamdist
+C cycles/limb cycles/limb
+C P4 model 4 (Nocona): 8
+C K8: 7.5
+C K10: 3.5
+C P6-15: 3.68
C TODO
C * Make a mpn_hamdist based on this. Alignment could either be handled by
C using movdqu for one operand and movdqa for the other, or by painfully
-C shifting as we go. Unfortunately, there seem to be no usable shift
+C shifting as we go. Unfortunately, there seem to be no useable shift
C instruction, except for one that takes an immediate count.
C * It would probably be possible to cut a few cycles/limb using software
C pipelining.
diff --git a/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm b/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
index f421d1323e..bbf43245cb 100644
--- a/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
@@ -1,32 +1,21 @@
dnl Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
-dnl Copyright 2001-2004 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm b/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
index 2dd57d25d9..fc56f164ed 100644
--- a/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
+++ b/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
@@ -1,32 +1,21 @@
dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
-
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -41,7 +30,7 @@ C * Look into different loop alignment, we now expand the code about 50 bytes
C with possibly needless alignment.
C * Use OSP, should solve feed-in latency problems.
C * Address relative slowness for un<=3 for Pentium M. The old code is there
-C considerably faster. (1:20/14, 2:34:32, 3:66/57)
+C consideraly faster. (1:20/14, 2:34:32, 3:66/57)
C INPUT PARAMETERS
C rp sp + 4
diff --git a/gmp/mpn/x86/pentium4/sse2/sub_n.asm b/gmp/mpn/x86/pentium4/sse2/sub_n.asm
index 5ba1c018ec..02d5f01474 100644
--- a/gmp/mpn/x86/pentium4/sse2/sub_n.asm
+++ b/gmp/mpn/x86/pentium4/sse2/sub_n.asm
@@ -1,44 +1,37 @@
dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction.
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C dst!=src1,2 dst==src1 dst==src2
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 4 6 6
-C P4 model 3-4 (Prescott) 4.25 7.5 7.5
+C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
+C 6.0 cycles/limb if dst==src1 or dst==src2
+C P4 Prescott: >= 5 cycles/limb
+
+
+C mp_limb_t mpn_sub_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_sub_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C The main loop code is 2x unrolled so that the carry bit can alternate
+C between mm0 and mm1.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
@@ -54,8 +47,10 @@ define(SAVE_EBX,`PARAM_SRC1')
PROLOGUE(mpn_sub_nc)
deflit(`FRAME',0)
+
movd PARAM_CARRY, %mm0
jmp L(start_nc)
+
EPILOGUE()
ALIGN(8)
@@ -63,16 +58,16 @@ PROLOGUE(mpn_sub_n)
deflit(`FRAME',0)
pxor %mm0, %mm0
L(start_nc):
- mov PARAM_SRC1, %eax
- mov %ebx, SAVE_EBX
- mov PARAM_SRC2, %ebx
- mov PARAM_DST, %edx
- mov PARAM_SIZE, %ecx
+ movl PARAM_SRC1, %eax
+ movl %ebx, SAVE_EBX
+ movl PARAM_SRC2, %ebx
+ movl PARAM_DST, %edx
+ movl PARAM_SIZE, %ecx
- lea (%eax,%ecx,4), %eax C src1 end
- lea (%ebx,%ecx,4), %ebx C src2 end
- lea (%edx,%ecx,4), %edx C dst end
- neg %ecx C -size
+ leal (%eax,%ecx,4), %eax C src1 end
+ leal (%ebx,%ecx,4), %ebx C src2 end
+ leal (%edx,%ecx,4), %edx C dst end
+ negl %ecx C -size
L(top):
C eax src1 end
@@ -90,7 +85,7 @@ L(top):
psrlq $63, %mm1
- add $1, %ecx
+ addl $1, %ecx
jz L(done_mm1)
movd (%eax,%ecx,4), %mm0
@@ -102,17 +97,18 @@ L(top):
psrlq $63, %mm0
- add $1, %ecx
+ addl $1, %ecx
jnz L(top)
+
movd %mm0, %eax
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
emms
ret
L(done_mm1):
movd %mm1, %eax
- mov SAVE_EBX, %ebx
+ movl SAVE_EBX, %ebx
emms
ret
diff --git a/gmp/mpn/x86/pentium4/sse2/submul_1.asm b/gmp/mpn/x86/pentium4/sse2/submul_1.asm
index 020675bd7b..ceb41f2ac0 100644
--- a/gmp/mpn/x86/pentium4/sse2/submul_1.asm
+++ b/gmp/mpn/x86/pentium4/sse2/submul_1.asm
@@ -1,71 +1,60 @@
dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
dnl subtract the result from a second limb vector.
-dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C P6 model 0-8,10-12 -
-C P6 model 9 (Banias) 6.8
-C P6 model 13 (Dothan) 6.9
-C P4 model 0-1 (Willamette) ?
-C P4 model 2 (Northwood) 5.87
-C P4 model 3-4 (Prescott) 6.5
+C P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon
+C (stepping 10).
-C This code represents a step forwards compared to the code available before
-C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is
-C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and
-C Prescott compared to the old code.
+
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C This code is not particularly good at 7 c/l. The dependent chain is only
+C 4 c/l and there's only 4 MMX unit instructions, so it's not clear why that
+C speed isn't achieved.
C
C The arrangements made here to get a two instruction dependent chain are
-C slightly subtle. In the loop the carry (or borrow rather) is a negative so
-C that a paddq can be used to give a low limb ready to store, and a high limb
-C ready to become the new carry after a psrlq.
+C slightly subtle. In the loop the carry (or borrow rather) is a negative
+C so that a paddq can be used to give a low limb ready to store, and a high
+C limb ready to become the new carry after a psrlq.
C
-C If the carry was a simple twos complement negative then the psrlq shift would
-C need to bring in 0 bits or 1 bits according to whether the high was zero or
-C non-zero, since a non-zero value would represent a negative needing sign
-C extension. That wouldn't be particularly easy to arrange and certainly would
-C add an instruction to the dependent chain, so instead an offset is applied so
-C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to
-C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
-C always positive and can always have 0 bits shifted in, which is what psrlq
-C does.
+C If the carry was a simple twos complement negative then the psrlq shift
+C would need to bring in 0 bits or 1 bits according to whether the high was
+C zero or non-zero, since a non-zero value would represent a negative
+C needing sign extension. That wouldn't be particularly easy to arrange and
+C certainly would add an instruction to the dependent chain, so instead an
+C offset is applied so that the high limb will be 0xFFFFFFFF+c. With c in
+C the range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to
+C 0xFFFFFFFF and is therefore always positive and can always have 0 bits
+C shifted in, which is what psrlq does.
C
C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
C done off the dependent chain. The total adjustment then is to add
-C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
-C to remove the offset from the current carry, for a net add of
-C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when
-C fetched.
+C 0xFFFFFFFF00000000 to offset the new carry, and subtract
+C 0x00000000FFFFFFFF to remove the offset from the current carry, for a net
+C add of 0xFFFFFFFE00000001. In the code this is applied to the destination
+C limb when fetched.
C
C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
C negative, which is how it's undone for the return value, but that doesn't
@@ -91,16 +80,16 @@ deflit(`FRAME',0)
pxor %mm1, %mm1 C initial borrow
L(start_1c):
- mov PARAM_SRC, %eax
+ movl PARAM_SRC, %eax
pcmpeqd %mm0, %mm0
movd PARAM_MULTIPLIER, %mm7
pcmpeqd %mm6, %mm6
- mov PARAM_DST, %edx
+ movl PARAM_DST, %edx
psrlq $32, %mm0 C 0x00000000FFFFFFFF
- mov PARAM_SIZE, %ecx
+ movl PARAM_SIZE, %ecx
psllq $32, %mm6 C 0xFFFFFFFF00000000
psubq %mm0, %mm6 C 0xFFFFFFFE00000001
@@ -108,75 +97,32 @@ L(start_1c):
psubq %mm1, %mm0 C 0xFFFFFFFF - borrow
- movd (%eax), %mm3 C up
- movd (%edx), %mm4 C rp
-
- add $-1, %ecx
- paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm3
- jnz L(gt1)
- psubq %mm3, %mm4 C prod
- paddq %mm4, %mm0 C borrow
- movd %mm0, (%edx) C result
- jmp L(rt)
-
-L(gt1): movd 4(%eax), %mm1 C up
- movd 4(%edx), %mm2 C rp
-
- add $-1, %ecx
- jz L(eev)
-
- ALIGN(16)
-L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ C eax src, incrementing
+ C ebx
+ C ecx loop counter, decrementing
+ C edx dst, incrementing
+ C
+ C mm0 0xFFFFFFFF - borrow
+ C mm6 0xFFFFFFFE00000001
+ C mm7 multiplier
+
+L(loop):
+ movd (%eax), %mm1 C src
+ leal 4(%eax), %eax
+ movd (%edx), %mm2 C dst
+ paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
pmuludq %mm7, %mm1
- psubq %mm3, %mm4 C prod
- movd 8(%eax), %mm3 C up
- paddq %mm4, %mm0 C borrow
- movd 8(%edx), %mm4 C rp
- movd %mm0, (%edx) C result
- psrlq $32, %mm0
-
- add $-1, %ecx
- jz L(eod)
-
- paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm3
psubq %mm1, %mm2 C prod
- movd 12(%eax), %mm1 C up
paddq %mm2, %mm0 C borrow
- movd 12(%edx), %mm2 C rp
- movd %mm0, 4(%edx) C result
- psrlq $32, %mm0
-
- lea 8(%eax), %eax
- lea 8(%edx), %edx
- add $-1, %ecx
- jnz L(top)
-
-
-L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm1
- psubq %mm3, %mm4 C prod
- paddq %mm4, %mm0 C borrow
+ subl $1, %ecx
movd %mm0, (%edx) C result
psrlq $32, %mm0
- psubq %mm1, %mm2 C prod
- paddq %mm2, %mm0 C borrow
- movd %mm0, 4(%edx) C result
-L(rt): psrlq $32, %mm0
+ leal 4(%edx), %edx
+ jnz L(loop)
+
movd %mm0, %eax
- not %eax
+ notl %eax
emms
ret
-L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
- pmuludq %mm7, %mm3
- psubq %mm1, %mm2 C prod
- paddq %mm2, %mm0 C borrow
- movd %mm0, 4(%edx) C result
- psrlq $32, %mm0
- psubq %mm3, %mm4 C prod
- paddq %mm4, %mm0 C borrow
- movd %mm0, 8(%edx) C result
- jmp L(rt)
EPILOGUE()
diff --git a/gmp/mpn/x86/rshift.asm b/gmp/mpn/x86/rshift.asm
index a60dcaa4b2..8e33eabd61 100644
--- a/gmp/mpn/x86/rshift.asm
+++ b/gmp/mpn/x86/rshift.asm
@@ -1,43 +1,33 @@
dnl x86 mpn_rshift -- mpn right shift.
-dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002 Free Software
+dnl Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C P54 7.5
-C P55 7.0
-C P6 2.5
-C K6 4.5
-C K7 5.0
-C P4 16.5
+C P54: 7.5
+C P55: 7.0
+C P6: 2.5
+C K6: 4.5
+C K7: 5.0
+C P4: 16.5
C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
diff --git a/gmp/mpn/x86/sec_tabselect.asm b/gmp/mpn/x86/sec_tabselect.asm
deleted file mode 100644
index c7c2e059f1..0000000000
--- a/gmp/mpn/x86/sec_tabselect.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-dnl x86 mpn_sec_tabselect.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C P5 ?
-C P6 model 0-8,10-12 ?
-C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) ?
-C P4 model 0 (Willamette) ?
-C P4 model 1 (?) ?
-C P4 model 2 (Northwood) 4.5
-C P4 model 3 (Prescott) ?
-C P4 model 4 (Nocona) ?
-C Intel Atom ?
-C AMD K6 ?
-C AMD K7 3.4
-C AMD K8 ?
-C AMD K10 ?
-
-C NOTES
-C * This has not been tuned for any specific processor. Its speed should not
-C be too bad, though.
-C * Using SSE2 could result in many-fold speedup.
-
-C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
-define(`rp', `%edi')
-define(`tp', `%esi')
-define(`n', `%ebx')
-define(`nents', `%ecx')
-define(`which', `36(%esp)')
-
-define(`i', `%ebp')
-define(`maskp', `20(%esp)')
-define(`maskn', `32(%esp)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sec_tabselect)
- push %edi
- push %esi
- push %ebx
- push %ebp
- mov 20(%esp), rp
- mov 24(%esp), tp
- mov 28(%esp), n
- mov 32(%esp), nents
-
- lea (rp,n,4), rp
- lea (tp,n,4), tp
- sub nents, which
-L(outer):
- mov which, %eax
- add nents, %eax
- neg %eax C set CF iff 'which' != k
- sbb %eax, %eax
- mov %eax, maskn
- not %eax
- mov %eax, maskp
-
- mov n, i
- neg i
-
- ALIGN(16)
-L(top): mov (tp,i,4), %eax
- and maskp, %eax
- mov (rp,i,4), %edx
- and maskn, %edx
- or %edx, %eax
- mov %eax, (rp,i,4)
- inc i
- js L(top)
-
-L(end): mov n, %eax
- lea (tp,%eax,4), tp
- dec nents
- jne L(outer)
-
-L(outer_end):
- pop %ebp
- pop %ebx
- pop %esi
- pop %edi
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86/sqr_basecase.asm b/gmp/mpn/x86/sqr_basecase.asm
index 39f8a89805..9a7e13327b 100644
--- a/gmp/mpn/x86/sqr_basecase.asm
+++ b/gmp/mpn/x86/sqr_basecase.asm
@@ -1,43 +1,32 @@
dnl x86 generic mpn_sqr_basecase -- square an mpn number.
dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
-
+dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/crossproduct cycles/triangleproduct
-C P5
-C P6
-C K6
-C K7
-C P4
+C P5:
+C P6:
+C K6:
+C K7:
+C P4:
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
diff --git a/gmp/mpn/x86/t-zdisp.sh b/gmp/mpn/x86/t-zdisp.sh
index 61efdd6c4f..6c55067b6c 100755
--- a/gmp/mpn/x86/t-zdisp.sh
+++ b/gmp/mpn/x86/t-zdisp.sh
@@ -2,31 +2,20 @@
#
# Copyright 2000 Free Software Foundation, Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# Usage: cd $(builddir)/mpn
diff --git a/gmp/mpn/x86/t-zdisp2.pl b/gmp/mpn/x86/t-zdisp2.pl
index b441b6579a..d5e2d93dc0 100755
--- a/gmp/mpn/x86/t-zdisp2.pl
+++ b/gmp/mpn/x86/t-zdisp2.pl
@@ -2,31 +2,20 @@
#
# Copyright 2001, 2002 Free Software Foundation, Inc.
#
-# This file is part of the GNU MP Library.
+# This file is part of the GNU MP Library.
#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of either:
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
#
-# * the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your
-# option) any later version.
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
#
-# or
-#
-# * the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any
-# later version.
-#
-# or both in parallel, as here.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-# for more details.
-#
-# You should have received copies of the GNU General Public License and the
-# GNU Lesser General Public License along with the GNU MP Library. If not,
-# see https://www.gnu.org/licenses/.
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
# Usage: cd $(builddir)/mpn
@@ -82,7 +71,7 @@ sub process {
}
}
-# Ensure we're using the right SQR_TOOM2_THRESHOLD for the part of the
+# Ensure we're using the right SQR_KARATSUBA_THRESHOLD for the part of the
# tree being processed.
sub process_mparam {
my $file = "$File::Find::dir/gmp-mparam.h";
@@ -90,10 +79,10 @@ sub process_mparam {
print "$file\n" if $opt{'t'};
open MPARAM, "<$file" or die;
while (<MPARAM>) {
- if (/^#define SQR_TOOM2_THRESHOLD[ \t]*([0-9][0-9]*)/) {
+ if (/^#define SQR_KARATSUBA_THRESHOLD[ \t]*([0-9][0-9]*)/) {
open KARA, ">$tempfile" or die;
- print KARA "define(\`SQR_TOOM2_THRESHOLD',$1)\n\n";
- print "define(\`SQR_TOOM2_THRESHOLD',$1)\n" if $opt{'t'};
+ print KARA "define(\`SQR_KARATSUBA_THRESHOLD',$1)\n\n";
+ print "define(\`SQR_KARATSUBA_THRESHOLD',$1)\n" if $opt{'t'};
close KARA or die;
last;
}
diff --git a/gmp/mpn/x86/udiv.asm b/gmp/mpn/x86/udiv.asm
index a3ee08860f..5c7d3f3533 100644
--- a/gmp/mpn/x86/udiv.asm
+++ b/gmp/mpn/x86/udiv.asm
@@ -1,32 +1,21 @@
dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/umul.asm b/gmp/mpn/x86/umul.asm
index 34fe434400..d0116de6d9 100644
--- a/gmp/mpn/x86/umul.asm
+++ b/gmp/mpn/x86/umul.asm
@@ -1,32 +1,21 @@
dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
diff --git a/gmp/mpn/x86/x86-defs.m4 b/gmp/mpn/x86/x86-defs.m4
index 1538b6820c..5b4a8e1fad 100644
--- a/gmp/mpn/x86/x86-defs.m4
+++ b/gmp/mpn/x86/x86-defs.m4
@@ -4,33 +4,23 @@ divert(-1)
dnl m4 macros for x86 assembler.
-dnl Copyright 1999-2003, 2007, 2010, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
+dnl Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
dnl Notes:
@@ -51,7 +41,7 @@ dnl This is only a problem in macro definitions, not in ordinary text,
dnl and not in macro parameters like text passed to forloop() or ifdef().
-deflit(GMP_LIMB_BYTES, 4)
+deflit(BYTES_PER_MP_LIMB, 4)
dnl Libtool gives -DPIC -DDLL_EXPORT to indicate a cygwin or mingw DLL. We
@@ -68,41 +58,24 @@ dnl order they appear in that structure.
define(CPUVEC_FUNCS_LIST,
``add_n',
-`addlsh1_n',
-`addlsh2_n',
`addmul_1',
-`addmul_2',
-`bdiv_dbm1c',
-`cnd_add_n',
-`cnd_sub_n',
-`com',
`copyd',
`copyi',
`divexact_1',
+`divexact_by3c',
`divrem_1',
`gcd_1',
`lshift',
-`lshiftc',
`mod_1',
-`mod_1_1p',
-`mod_1_1p_cps',
-`mod_1s_2p',
-`mod_1s_2p_cps',
-`mod_1s_4p',
-`mod_1s_4p_cps',
`mod_34lsub1',
`modexact_1c_odd',
`mul_1',
`mul_basecase',
-`mullo_basecase',
`preinv_divrem_1',
`preinv_mod_1',
-`redc_1',
-`redc_2',
`rshift',
`sqr_basecase',
`sub_n',
-`sublsh1_n',
`submul_1'')
@@ -922,7 +895,7 @@ dnl movl_code_address(L(foo),%eax)
dnl
dnl This macro is only meant for use in ASSERT()s or when testing, since
dnl the PIC sequence it generates will want to be done with a ret balancing
-dnl the call on CPUs with return address branch prediction.
+dnl the call on CPUs with return address branch prediction.
dnl
dnl The addl generated here has a backward reference to the label, and so
dnl won't suffer from the two forwards references bug in old gas (described
@@ -955,9 +928,7 @@ m4_assert_numargs(1)
dnl Usage LEA(symbol,reg)
-define(`LEA',
-m4_assert_numargs(2)
-`ifdef(`PIC',`
+define(`LEA',`
define(`EPILOGUE_cpu',
`
L(movl_eip_`'substr($2,1)):
@@ -965,12 +936,11 @@ L(movl_eip_`'substr($2,1)):
ret_internal
SIZE($'`1, .-$'`1)')
- call L(movl_eip_`'substr($2,1))
- addl $_GLOBAL_OFFSET_TABLE_, $2
- movl $1@GOT($2), $2
-',`
- movl `$'$1, $2
-')')
+ call L(movl_eip_`'substr($2,1))
+ addl $_GLOBAL_OFFSET_TABLE_, $2
+ movl $1@GOT($2), $2
+')
+
define(`DEF_OBJECT',
m4_assert_numargs_range(1,2)
@@ -983,17 +953,4 @@ define(`END_OBJECT',
m4_assert_numargs(1)
` SIZE(`$1',.-`$1')')
-dnl Usage: CALL(funcname)
-dnl
-
-define(`CALL',
-m4_assert_numargs(1)
-`ifdef(`PIC',
- `call GSYM_PREFIX`'$1@PLT',
- `call GSYM_PREFIX`'$1')')
-
-ifdef(`PIC',
-`define(`PIC_WITH_EBX')',
-`undefine(`PIC_WITH_EBX')')
-
divert`'dnl
diff --git a/gmp/mpn/x86_64/README b/gmp/mpn/x86_64/README
index 9c8a586622..c89f841027 100644
--- a/gmp/mpn/x86_64/README
+++ b/gmp/mpn/x86_64/README
@@ -3,28 +3,17 @@ Copyright 2003, 2004, 2006, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/.
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
diff --git a/gmp/mpn/x86_64/addaddmul_1msb0.asm b/gmp/mpn/x86_64/addaddmul_1msb0.asm
index 87c21b4aca..89e7bed980 100644
--- a/gmp/mpn/x86_64/addaddmul_1msb0.asm
+++ b/gmp/mpn/x86_64/addaddmul_1msb0.asm
@@ -3,41 +3,26 @@ dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.167
-C AMD K10 2.167
-C Intel P4 12.0
-C Intel core2 4.0
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
+C K8: 2.167
+C P4: 12.0
+C P6-15: 4.0
C TODO
C * Perhaps handle various n mod 3 sizes better. The code now is too large.
@@ -82,7 +67,7 @@ L(top): mul %r9
mul %r8
add %rax, %r10
mov -16(bp,n,8), %rax
- mov $0, R32(%r11)
+ mov $0, %r11d
adc %rdx, %r11
mul %r9
add %rax, %r10
@@ -92,7 +77,7 @@ L(top): mul %r9
mul %r8
add %rax, %r11
mov -8(bp,n,8), %rax
- mov $0, R32(%r12)
+ mov $0, %r12d
adc %rdx, %r12
mul %r9
add %rax, %r11
@@ -102,7 +87,7 @@ L(top): mul %r9
add %rax, %r12
mov %r11, -8(rp,n,8)
mov (bp,n,8), %rax
- mov $0, R32(%r10)
+ mov $0, %r10d
adc %rdx, %r10
add $3, n
js L(top)
@@ -119,7 +104,7 @@ L(end): cmp $1, R32(n)
mul %r8
add %rax, %r10
mov -16(bp), %rax
- mov $0, R32(%r11)
+ mov $0, %r11d
adc %rdx, %r11
mul %r9
add %rax, %r10
@@ -129,7 +114,7 @@ L(end): cmp $1, R32(n)
mul %r8
add %rax, %r11
mov -8(bp), %rax
- mov $0, R32(%r12)
+ mov $0, %r12d
adc %rdx, %r12
mul %r9
add %rax, %r11
@@ -148,7 +133,7 @@ L(end): cmp $1, R32(n)
mul %r8
add %rax, %r10
mov -8(bp), %rax
- mov $0, R32(%r11)
+ mov $0, %r11d
adc %rdx, %r11
mul %r9
add %rax, %r10
diff --git a/gmp/mpn/x86_64/aorrlsh1_n.asm b/gmp/mpn/x86_64/addlsh1_n.asm
index 6ee0872823..e142f9ef9e 100644
--- a/gmp/mpn/x86_64/aorrlsh1_n.asm
+++ b/gmp/mpn/x86_64/addlsh1_n.asm
@@ -1,55 +1,37 @@
dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-dnl Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2
-C AMD K10 2
-C AMD bd1 ?
-C AMD bobcat ?
-C Intel P4 13
-C Intel core2 3.45
-C Intel NHM ?
-C Intel SBR ?
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 2
+C K10: 2
+C P4: 13
+C P6-15: 3.45
C Sometimes speed degenerates, supposedly related to that some operand
C alignments cause cache conflicts.
C The speed is limited by decoding/issue bandwidth. There are 22 instructions
-C in the loop, which corresponds to ceil(22/3)/4 = 1.83 c/l.
+C in the loop, which corresponds to ceil(22/3)/4 = 2.0 c/l.
C INPUT PARAMETERS
define(`rp',`%rdi')
@@ -57,25 +39,10 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')
-ifdef(`OPERATION_addlsh1_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_addlsh1_n)')
-ifdef(`OPERATION_rsblsh1_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_rsblsh1_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
+PROLOGUE(mpn_addlsh1_n)
push %rbp
mov (vp), %r8
@@ -97,11 +64,11 @@ L(b11): add %r8, %r8
mov 16(vp,n,8), %r10
adc %r10, %r10
sbb R32(%rax), R32(%rax) C save scy
- ADDSUB (up,n,8), %r8
- ADCSBB 8(up,n,8), %r9
+ add (up,n,8), %r8
+ adc 8(up,n,8), %r9
mov %r8, (rp,n,8)
mov %r9, 8(rp,n,8)
- ADCSBB 16(up,n,8), %r10
+ adc 16(up,n,8), %r10
mov %r10, 16(rp,n,8)
sbb R32(%rbp), R32(%rbp) C save acy
add $3, n
@@ -111,8 +78,8 @@ L(b10): add %r8, %r8
mov 8(vp,n,8), %r9
adc %r9, %r9
sbb R32(%rax), R32(%rax) C save scy
- ADDSUB (up,n,8), %r8
- ADCSBB 8(up,n,8), %r9
+ add (up,n,8), %r8
+ adc 8(up,n,8), %r9
mov %r8, (rp,n,8)
mov %r9, 8(rp,n,8)
sbb R32(%rbp), R32(%rbp) C save acy
@@ -121,7 +88,7 @@ L(b10): add %r8, %r8
L(b01): add %r8, %r8
sbb R32(%rax), R32(%rax) C save scy
- ADDSUB (up,n,8), %r8
+ add (up,n,8), %r8
mov %r8, (rp,n,8)
sbb R32(%rbp), R32(%rbp) C save acy
inc n
@@ -142,13 +109,13 @@ L(b00): adc %r8, %r8
sbb R32(%rax), R32(%rax) C save scy
add R32(%rbp), R32(%rbp) C restore acy
- ADCSBB (up,n,8), %r8
+ adc (up,n,8), %r8
nop C Hammer speedup!
- ADCSBB 8(up,n,8), %r9
+ adc 8(up,n,8), %r9
mov %r8, (rp,n,8)
mov %r9, 8(rp,n,8)
- ADCSBB 16(up,n,8), %r10
- ADCSBB 24(up,n,8), %r11
+ adc 16(up,n,8), %r10
+ adc 24(up,n,8), %r11
mov %r10, 16(rp,n,8)
mov %r11, 24(rp,n,8)
@@ -156,15 +123,9 @@ L(b00): adc %r8, %r8
add $4, n
js L(top)
-L(end):
-ifdef(`OPERATION_addlsh1_n',`
- add R32(%rbp), R32(%rax)
- neg R32(%rax)')
-ifdef(`OPERATION_rsblsh1_n',`
- sub R32(%rax), R32(%rbp)
- movslq R32(%rbp), %rax')
+L(end): add R32(%rbp), R32(%rax)
+ neg R32(%rax)
pop %rbp
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/addmul_2.asm b/gmp/mpn/x86_64/addmul_2.asm
index 18307d719f..8f133c3b00 100644
--- a/gmp/mpn/x86_64/addmul_2.asm
+++ b/gmp/mpn/x86_64/addmul_2.asm
@@ -1,51 +1,39 @@
dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl add the result to a third limb vector.
-dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.375
-C AMD K10 2.375
-C Intel P4 15-16
-C Intel core2 4.45
-C Intel NHM 4.32
-C Intel SBR 3.4
-C Intel atom ?
-C VIA nano 4.4
+C K8,K9: 2.375
+C K10: 2.375
+C P4: ?
+C P6-15: 4.45
C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.
C TODO
-C * Tune feed-in and wind-down code.
+C * Work on feed-in and wind-down code.
+C * Convert "mov $0" to "xor".
+C * Adjust initial lea to save some bytes.
+C * Perhaps adjust n from n_param&3 value?
C INPUT PARAMETERS
define(`rp', `%rdi')
@@ -61,124 +49,119 @@ define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_addmul_2)
- FUNC_ENTRY(4)
- mov n_param, n
push %rbx
push %rbp
- mov 0(vp), v0
+ mov (vp), v0
mov 8(vp), v1
- mov R32(n_param), R32(%rbx)
- mov (up), %rax
- lea -8(up,n_param,8), up
- lea -8(rp,n_param,8), rp
- mul v0
+ mov n_param, n
neg n
- and $3, R32(%rbx)
- jz L(b0)
- cmp $2, R32(%rbx)
- jc L(b1)
- jz L(b2)
-
-L(b3): mov %rax, w1
+ lea -32(up,n_param,8), up
+ lea -32(rp,n_param,8), rp
+
+ and $3, R32(n_param)
+ jz L(am2p0)
+ cmp $2, R32(n_param)
+ jc L(am2p1)
+ jz L(am2p2)
+L(am2p3):
+ mov 32(up,n,8), %rax
+ mul v0
+ mov %rax, w1
+ mov 32(up,n,8), %rax
mov %rdx, w2
xor R32(w3), R32(w3)
- mov 8(up,n,8), %rax
- dec n
- jmp L(lo3)
-
-L(b2): mov %rax, w2
- mov 8(up,n,8), %rax
- mov %rdx, w3
- xor R32(w0), R32(w0)
- add $-2, n
- jmp L(lo2)
-
-L(b1): mov %rax, w3
- mov 8(up,n,8), %rax
- mov %rdx, w0
- xor R32(w1), R32(w1)
- inc n
- jmp L(lo1)
-
-L(b0): mov $0, R32(w3)
+ add $2, n
+ jmp L(am3)
+L(am2p0):
+ mov 32(up,n,8), %rax
+ mul v0
mov %rax, w0
- mov 8(up,n,8), %rax
+ mov 32(up,n,8), %rax
mov %rdx, w1
xor R32(w2), R32(w2)
- jmp L(lo0)
+ add $3, n
+ jmp L(am0)
+L(am2p1):
+ mov 32(up,n,8), %rax
+ mul v0
+ mov %rax, w3
+ mov 32(up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ jmp L(am1)
+L(am2p2):
+ mov 32(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov 32(up,n,8), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ add $1, n
+ jmp L(am2)
ALIGN(32)
-L(top): mov $0, R32(w1)
- mul v0
- add %rax, w3
- mov (up,n,8), %rax
- adc %rdx, w0
- adc $0, R32(w1)
-L(lo1): mul v1
+L(top):
add w3, (rp,n,8)
- mov $0, R32(w3)
adc %rax, w0
- mov $0, R32(w2)
mov 8(up,n,8), %rax
adc %rdx, w1
+ mov $0, R32(w2)
mul v0
add %rax, w0
mov 8(up,n,8), %rax
adc %rdx, w1
adc $0, R32(w2)
-L(lo0): mul v1
+L(am0): mul v1
add w0, 8(rp,n,8)
adc %rax, w1
adc %rdx, w2
mov 16(up,n,8), %rax
+ mov $0, R32(w3)
mul v0
add %rax, w1
+ mov 16(up,n,8), %rax
adc %rdx, w2
adc $0, R32(w3)
- mov 16(up,n,8), %rax
-L(lo3): mul v1
+L(am3): mul v1
add w1, 16(rp,n,8)
adc %rax, w2
- adc %rdx, w3
- xor R32(w0), R32(w0)
mov 24(up,n,8), %rax
+ adc %rdx, w3
mul v0
+ mov $0, R32(w0)
add %rax, w2
- mov 24(up,n,8), %rax
adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,n,8), %rax
adc $0, R32(w0)
-L(lo2): mul v1
+L(am2): mul v1
add w2, 24(rp,n,8)
adc %rax, w3
adc %rdx, w0
mov 32(up,n,8), %rax
- add $4, n
- js L(top)
-
-L(end): xor R32(w1), R32(w1)
mul v0
add %rax, w3
- mov (up), %rax
+ mov 32(up,n,8), %rax
adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w3, (rp)
+ adc $0, R32(w1)
+L(am1): mul v1
+ add $4, n
+ js L(top)
+
+ add w3, (rp,n,8)
adc %rax, w0
adc %rdx, w1
- mov w0, 8(rp)
+ mov w0, 8(rp,n,8)
mov w1, %rax
pop %rbp
pop %rbx
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/aorrlsh2_n.asm b/gmp/mpn/x86_64/aorrlsh2_n.asm
deleted file mode 100644
index 999e972fb4..0000000000
--- a/gmp/mpn/x86_64/aorrlsh2_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
-dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009-2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 62)
-
-ifdef(`OPERATION_addlsh2_n',`
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_addlsh2_n)')
-ifdef(`OPERATION_rsblsh2_n',`
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_rsblsh2_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/aorrlshC_n.asm b/gmp/mpn/x86_64/aorrlshC_n.asm
deleted file mode 100644
index 5a9fd4dfb9..0000000000
--- a/gmp/mpn/x86_64/aorrlshC_n.asm
+++ /dev/null
@@ -1,160 +0,0 @@
-dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
-dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
-
-dnl Copyright 2009-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-C cycles/limb
-C AMD K8,K9 2
-C AMD K10 2
-C Intel P4 ?
-C Intel core2 3
-C Intel NHM 2.75
-C Intel SBR 2.55
-C Intel atom ?
-C VIA nano ?
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-
-define(M, eval(m4_lshift(1,LSH)))
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (vp), %r8
- lea (,%r8,M), %r12
- shr $RSH, %r8
-
- mov R32(n), R32(%rax)
- lea (rp,n,8), rp
- lea (up,n,8), up
- lea (vp,n,8), vp
- neg n
- and $3, R8(%rax)
- je L(b00)
- cmp $2, R8(%rax)
- jc L(b01)
- je L(b10)
-
-L(b11): mov 8(vp,n,8), %r10
- lea (%r8,%r10,M), %r14
- shr $RSH, %r10
- mov 16(vp,n,8), %r11
- lea (%r10,%r11,M), %r15
- shr $RSH, %r11
- ADDSUB (up,n,8), %r12
- ADCSBB 8(up,n,8), %r14
- ADCSBB 16(up,n,8), %r15
- sbb R32(%rax), R32(%rax) C save carry for next
- mov %r12, (rp,n,8)
- mov %r14, 8(rp,n,8)
- mov %r15, 16(rp,n,8)
- add $3, n
- js L(top)
- jmp L(end)
-
-L(b01): mov %r8, %r11
- ADDSUB (up,n,8), %r12
- sbb R32(%rax), R32(%rax) C save carry for next
- mov %r12, (rp,n,8)
- add $1, n
- js L(top)
- jmp L(end)
-
-L(b10): mov 8(vp,n,8), %r11
- lea (%r8,%r11,M), %r15
- shr $RSH, %r11
- ADDSUB (up,n,8), %r12
- ADCSBB 8(up,n,8), %r15
- sbb R32(%rax), R32(%rax) C save carry for next
- mov %r12, (rp,n,8)
- mov %r15, 8(rp,n,8)
- add $2, n
- js L(top)
- jmp L(end)
-
-L(b00): mov 8(vp,n,8), %r9
- mov 16(vp,n,8), %r10
- jmp L(e00)
-
- ALIGN(16)
-L(top): mov 16(vp,n,8), %r10
- mov (vp,n,8), %r8
- mov 8(vp,n,8), %r9
- lea (%r11,%r8,M), %r12
- shr $RSH, %r8
-L(e00): lea (%r8,%r9,M), %r13
- shr $RSH, %r9
- mov 24(vp,n,8), %r11
- lea (%r9,%r10,M), %r14
- shr $RSH, %r10
- lea (%r10,%r11,M), %r15
- shr $RSH, %r11
- add R32(%rax), R32(%rax) C restore carry
- ADCSBB (up,n,8), %r12
- ADCSBB 8(up,n,8), %r13
- ADCSBB 16(up,n,8), %r14
- ADCSBB 24(up,n,8), %r15
- mov %r12, (rp,n,8)
- mov %r13, 8(rp,n,8)
- mov %r14, 16(rp,n,8)
- sbb R32(%rax), R32(%rax) C save carry for next
- mov %r15, 24(rp,n,8)
- add $4, n
- js L(top)
-L(end):
-
-ifelse(ADDSUB,add,`
- sub R32(%r11), R32(%rax)
- neg R32(%rax)
-',`
- add R32(%r11), R32(%rax)
- movslq R32(%rax), %rax
-')
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/aorrlsh_n.asm b/gmp/mpn/x86_64/aorrlsh_n.asm
index 5ca128fbf3..55176f7aa1 100644
--- a/gmp/mpn/x86_64/aorrlsh_n.asm
+++ b/gmp/mpn/x86_64/aorrlsh_n.asm
@@ -1,45 +1,32 @@
dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
+dnl ("rsb" means reversed subtract, name mandated by mpn_sublsh1_n which
+dnl subtracts the shifted operand from the unshifted operand.)
-dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 3.1 < 3.85 for lshift + add_n
-C AMD K10 3.1 < 3.85 for lshift + add_n
-C Intel P4 14.6 > 7.33 for lshift + add_n
-C Intel core2 3.87 > 3.27 for lshift + add_n
-C Intel NHM 4 > 3.75 for lshift + add_n
-C Intel SBR (5.8) > 3.46 for lshift + add_n
-C Intel atom (7.75) < 8.75 for lshift + add_n
-C VIA nano 4.7 < 6.25 for lshift + add_n
+C K8,K9: 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l)
+C K10: 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l)
+C P4: 14
+C P6-15: 4
C This was written quickly and not optimized at all. Surely one could get
C closer to 3 c/l or perhaps even under 3 c/l. Ideas:
@@ -54,67 +41,65 @@ define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n', `%rcx')
-define(`cnt', `%r8')
+define(`cnt', `%r8') C shift count (copied to %ecx before the shifts)
ifdef(`OPERATION_addlsh_n',`
- define(ADCSBB, `adc')
+ define(ADDSUBC, `adc')
define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
- define(ADCSBB, `sbb')
+ define(ADDSUBC, `sbb')
define(func, mpn_rsblsh_n)
')
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
+
push %r12
push %r13
push %r14
- push %rbp
+ push %r15
push %rbx
mov n, %rax
- xor R32(%rbx), R32(%rbx) C clear carry save register
- mov R32(%r8), R32(%rcx) C shift count
- xor R32(%rbp), R32(%rbp) C limb carry
+ xor %ebx, %ebx C clear carry save register
+ mov %r8d, %ecx C shift count
+ xor %r15d, %r15d C limb carry
- mov R32(%rax), R32(%r11)
- and $3, R32(%r11)
+ mov %eax, %r11d
+ and $3, %r11d
je L(4)
- sub $1, R32(%r11)
+ sub $1, %r11d
-L(012): mov (vp), %r8
+L(oopette):
+ mov 0(vp), %r8
mov %r8, %r12
- shl R8(%rcx), %r8
- or %rbp, %r8
- neg R8(%rcx)
- mov %r12, %rbp
- shr R8(%rcx), %rbp
- neg R8(%rcx)
- add R32(%rbx), R32(%rbx)
- ADCSBB (up), %r8
- mov %r8, (rp)
- sbb R32(%rbx), R32(%rbx)
+ shl %cl, %r8
+ or %r15, %r8
+ neg %cl
+ mov %r12, %r15
+ shr %cl, %r15
+ neg %cl
+ add %ebx, %ebx
+ ADDSUBC 0(up), %r8
+ mov %r8, 0(rp)
+ sbb %ebx, %ebx
lea 8(up), up
lea 8(vp), vp
lea 8(rp), rp
- sub $1, R32(%r11)
- jnc L(012)
+ sub $1, %r11d
+ jnc L(oopette)
-L(4): sub $4, %rax
+L(4):
+ sub $4, %rax
jc L(end)
- ALIGN(16)
-L(top): mov (vp), %r8
+L(oop):
+ mov 0(vp), %r8
mov %r8, %r12
mov 8(vp), %r9
mov %r9, %r13
@@ -122,55 +107,55 @@ L(top): mov (vp), %r8
mov %r10, %r14
mov 24(vp), %r11
- shl R8(%rcx), %r8
- shl R8(%rcx), %r9
- shl R8(%rcx), %r10
- or %rbp, %r8
- mov %r11, %rbp
- shl R8(%rcx), %r11
+ shl %cl, %r8
+ shl %cl, %r9
+ shl %cl, %r10
+ or %r15, %r8
+ mov %r11, %r15
+ shl %cl, %r11
- neg R8(%rcx)
+ neg %cl
- shr R8(%rcx), %r12
- shr R8(%rcx), %r13
- shr R8(%rcx), %r14
- shr R8(%rcx), %rbp C used next iteration
+ shr %cl, %r12
+ shr %cl, %r13
+ shr %cl, %r14
+ shr %cl, %r15 C used next loop
or %r12, %r9
or %r13, %r10
or %r14, %r11
- neg R8(%rcx)
+ neg %cl
- add R32(%rbx), R32(%rbx) C restore carry flag
+ add %ebx, %ebx C restore carry flag
- ADCSBB (up), %r8
- ADCSBB 8(up), %r9
- ADCSBB 16(up), %r10
- ADCSBB 24(up), %r11
+ ADDSUBC 0(up), %r8
+ ADDSUBC 8(up), %r9
+ ADDSUBC 16(up), %r10
+ ADDSUBC 24(up), %r11
- mov %r8, (rp)
+ mov %r8, 0(rp)
mov %r9, 8(rp)
mov %r10, 16(rp)
mov %r11, 24(rp)
- sbb R32(%rbx), R32(%rbx) C save carry flag
+ sbb %ebx, %ebx C save carry flag
lea 32(up), up
lea 32(vp), vp
lea 32(rp), rp
sub $4, %rax
- jnc L(top)
-
-L(end): add R32(%rbx), R32(%rbx)
- ADCSBB $0, %rbp
- mov %rbp, %rax
+ jnc L(oop)
+L(end):
+ add %ebx, %ebx C restore carry flag from the carry save register
+ ADDSUBC $0, %r15 C fold final carry (adc) or borrow (sbb) into the limb carry
+ mov %r15, %rax C return value
pop %rbx
- pop %rbp
+ pop %r15
pop %r14
pop %r13
pop %r12
- FUNC_EXIT()
+
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/aors_err1_n.asm b/gmp/mpn/x86_64/aors_err1_n.asm
deleted file mode 100644
index 54d0b3f9b7..0000000000
--- a/gmp/mpn/x86_64/aors_err1_n.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-dnl AMD64 mpn_add_err1_n, mpn_sub_err1_n
-
-dnl Contributed by David Harvey.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 2.75 (degenerates to 3 c/l for some alignments)
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`ep', `%rcx')
-define(`yp', `%r8')
-define(`n', `%r9')
-define(`cy_param', `8(%rsp)')
-
-define(`el', `%rbx')
-define(`eh', `%rbp')
-define(`t0', `%r10')
-define(`t1', `%r11')
-define(`t2', `%r12')
-define(`t3', `%r13')
-define(`w0', `%r14')
-define(`w1', `%r15')
-
-ifdef(`OPERATION_add_err1_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_err1_n)')
-ifdef(`OPERATION_sub_err1_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_err1_n)')
-
-MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- mov cy_param, %rax
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- lea (up,n,8), up
- lea (vp,n,8), vp
- lea (rp,n,8), rp
-
- mov R32(n), R32(%r10)
- and $3, R32(%r10)
- jz L(0mod4)
- cmp $2, R32(%r10)
- jc L(1mod4)
- jz L(2mod4)
-L(3mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- xor R32(t0), R32(t0)
- xor R32(t1), R32(t1)
- lea -24(yp,n,8), yp
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- mov 8(up,n,8), w1
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc 16(yp), el
- ADCSBB 8(vp,n,8), w1
- mov w1, 8(rp,n,8)
- cmovc 8(yp), t0
- mov 16(up,n,8), w0
- ADCSBB 16(vp,n,8), w0
- mov w0, 16(rp,n,8)
- cmovc (yp), t1
- setc %al C save carry
- add t0, el
- adc $0, eh
- add t1, el
- adc $0, eh
-
- add $3, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(16)
-L(0mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- lea (yp,n,8), yp
- neg n
- jmp L(loop)
-
- ALIGN(16)
-L(1mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- lea -8(yp,n,8), yp
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc (yp), el
- setc %al C save carry
-
- add $1, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(16)
-L(2mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- xor R32(t0), R32(t0)
- lea -16(yp,n,8), yp
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- mov 8(up,n,8), w1
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc 8(yp), el
- ADCSBB 8(vp,n,8), w1
- mov w1, 8(rp,n,8)
- cmovc (yp), t0
- setc %al C save carry
- add t0, el
- adc $0, eh
-
- add $2, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(32)
-L(loop):
- shr $1, %al C restore carry
- mov -8(yp), t0
- mov $0, R32(t3)
- mov (up,n,8), w0
- mov 8(up,n,8), w1
- ADCSBB (vp,n,8), w0
- cmovnc t3, t0
- ADCSBB 8(vp,n,8), w1
- mov -16(yp), t1
- mov w0, (rp,n,8)
- mov 16(up,n,8), w0
- mov w1, 8(rp,n,8)
- cmovnc t3, t1
- mov -24(yp), t2
- ADCSBB 16(vp,n,8), w0
- cmovnc t3, t2
- mov 24(up,n,8), w1
- ADCSBB 24(vp,n,8), w1
- cmovc -32(yp), t3
- setc %al C save carry
- add t0, el
- adc $0, eh
- add t1, el
- adc $0, eh
- add t2, el
- adc $0, eh
- mov w0, 16(rp,n,8)
- add t3, el
- lea -32(yp), yp
- adc $0, eh
- mov w1, 24(rp,n,8)
- add $4, n
- jnz L(loop)
-
-L(end):
- mov el, (ep)
- mov eh, 8(ep)
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/aors_err2_n.asm b/gmp/mpn/x86_64/aors_err2_n.asm
deleted file mode 100644
index ce5c2a49b6..0000000000
--- a/gmp/mpn/x86_64/aors_err2_n.asm
+++ /dev/null
@@ -1,172 +0,0 @@
-dnl AMD64 mpn_add_err2_n, mpn_sub_err2_n
-
-dnl Contributed by David Harvey.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 4.5
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 6.9
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`ep', `%rcx')
-define(`yp1', `%r8')
-define(`yp2', `%r9')
-define(`n_param', `8(%rsp)')
-define(`cy_param', `16(%rsp)')
-
-define(`cy1', `%r14')
-define(`cy2', `%rax')
-
-define(`n', `%r10')
-
-define(`w', `%rbx')
-define(`e1l', `%rbp')
-define(`e1h', `%r11')
-define(`e2l', `%r12')
-define(`e2h', `%r13')
-
-
-ifdef(`OPERATION_add_err2_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_err2_n)')
-ifdef(`OPERATION_sub_err2_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_err2_n)')
-
-MULFUNC_PROLOGUE(mpn_add_err2_n mpn_sub_err2_n)
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- mov cy_param, cy2
- mov n_param, n
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
-
- xor R32(e1l), R32(e1l)
- xor R32(e1h), R32(e1h)
- xor R32(e2l), R32(e2l)
- xor R32(e2h), R32(e2h)
-
- sub yp1, yp2
-
- lea (rp,n,8), rp
- lea (up,n,8), up
- lea (vp,n,8), vp
-
- test $1, n
- jnz L(odd)
-
- lea -8(yp1,n,8), yp1
- neg n
- jmp L(top)
-
- ALIGN(16)
-L(odd):
- lea -16(yp1,n,8), yp1
- neg n
- shr $1, cy2
- mov (up,n,8), w
- ADCSBB (vp,n,8), w
- cmovc 8(yp1), e1l
- cmovc 8(yp1,yp2), e2l
- mov w, (rp,n,8)
- sbb cy2, cy2
- inc n
- jz L(end)
-
- ALIGN(16)
-L(top):
- mov (up,n,8), w
- shr $1, cy2 C restore carry
- ADCSBB (vp,n,8), w
- mov w, (rp,n,8)
- sbb cy1, cy1 C generate mask, preserve CF
-
- mov 8(up,n,8), w
- ADCSBB 8(vp,n,8), w
- mov w, 8(rp,n,8)
- sbb cy2, cy2 C generate mask, preserve CF
-
- mov (yp1), w C (e1h:e1l) += cy1 * yp1 limb
- and cy1, w
- add w, e1l
- adc $0, e1h
-
- and (yp1,yp2), cy1 C (e2h:e2l) += cy1 * yp2 limb
- add cy1, e2l
- adc $0, e2h
-
- mov -8(yp1), w C (e1h:e1l) += cy2 * next yp1 limb
- and cy2, w
- add w, e1l
- adc $0, e1h
-
- mov -8(yp1,yp2), w C (e2h:e2l) += cy2 * next yp2 limb
- and cy2, w
- add w, e2l
- adc $0, e2h
-
- add $2, n
- lea -16(yp1), yp1
- jnz L(top)
-L(end):
-
- mov e1l, (ep)
- mov e1h, 8(ep)
- mov e2l, 16(ep)
- mov e2h, 24(ep)
-
- and $1, %eax C return carry
-
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/aors_err3_n.asm b/gmp/mpn/x86_64/aors_err3_n.asm
deleted file mode 100644
index bb6d0c5366..0000000000
--- a/gmp/mpn/x86_64/aors_err3_n.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-dnl AMD64 mpn_add_err3_n, mpn_sub_err3_n
-
-dnl Contributed by David Harvey.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 7.0
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`ep', `%rcx')
-define(`yp1', `%r8')
-define(`yp2', `%r9')
-define(`yp3_param', `8(%rsp)')
-define(`n_param', `16(%rsp)')
-define(`cy_param', `24(%rsp)')
-
-define(`n', `%r10')
-define(`yp3', `%rcx')
-define(`t', `%rbx')
-
-define(`e1l', `%rbp')
-define(`e1h', `%r11')
-define(`e2l', `%r12')
-define(`e2h', `%r13')
-define(`e3l', `%r14')
-define(`e3h', `%r15')
-
-
-
-ifdef(`OPERATION_add_err3_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_err3_n)')
-ifdef(`OPERATION_sub_err3_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_err3_n)')
-
-MULFUNC_PROLOGUE(mpn_add_err3_n mpn_sub_err3_n)
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- mov cy_param, %rax
- mov n_param, n
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- push ep
- mov 64(%rsp), yp3 C load from yp3_param
-
- xor R32(e1l), R32(e1l)
- xor R32(e1h), R32(e1h)
- xor R32(e2l), R32(e2l)
- xor R32(e2h), R32(e2h)
- xor R32(e3l), R32(e3l)
- xor R32(e3h), R32(e3h)
-
- sub yp1, yp2
- sub yp1, yp3
-
- lea -8(yp1,n,8), yp1
- lea (rp,n,8), rp
- lea (up,n,8), up
- lea (vp,n,8), vp
- neg n
-
- ALIGN(16)
-L(top):
- shr $1, %rax C restore carry
- mov (up,n,8), %rax
- ADCSBB (vp,n,8), %rax
- mov %rax, (rp,n,8)
- sbb %rax, %rax C save carry and generate mask
-
- mov (yp1), t
- and %rax, t
- add t, e1l
- adc $0, e1h
-
- mov (yp1,yp2), t
- and %rax, t
- add t, e2l
- adc $0, e2h
-
- mov (yp1,yp3), t
- and %rax, t
- add t, e3l
- adc $0, e3h
-
- lea -8(yp1), yp1
- inc n
- jnz L(top)
-
-L(end):
- and $1, %eax
- pop ep
-
- mov e1l, (ep)
- mov e1h, 8(ep)
- mov e2l, 16(ep)
- mov e2h, 24(ep)
- mov e3l, 32(ep)
- mov e3h, 40(ep)
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/aors_n.asm b/gmp/mpn/x86_64/aors_n.asm
index 8941f7a17b..dae5408ba4 100644
--- a/gmp/mpn/x86_64/aors_n.asm
+++ b/gmp/mpn/x86_64/aors_n.asm
@@ -1,57 +1,40 @@
dnl AMD64 mpn_add_n, mpn_sub_n
-dnl Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 1.5
-C AMD K10 1.5
-C AMD bd1 1.8
-C AMD bobcat 2.5
-C Intel P4
-C Intel core2 4.9
-C Intel NHM 5.5
-C Intel SBR 1.61
-C Intel IBR 1.61
-C Intel atom 4
-C VIA nano 3.25
-
-C The loop of this code is the result of running a code generation and
+C K8,K9: 1.5
+C K10: 1.5
+C P4: ?
+C P6-15 (Core2): 4.9
+C P6-28 (Atom): 4
+
+C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C INPUT PARAMETERS
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`vp', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8') C (only for mpn_add_nc)
ifdef(`OPERATION_add_n', `
define(ADCSBB, adc)
@@ -64,71 +47,29 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
mov R32(n), R32(%rax)
- shr $2, n
and $3, R32(%rax)
+ shr $2, n
bt $0, %r8 C cy flag <- carry parameter
- jrcxz L(lt4)
-
- mov (up), %r8
- mov 8(up), %r9
- dec n
- jmp L(mid)
-
+ jz L(1)
+ jmp L(ent)
EPILOGUE()
ALIGN(16)
PROLOGUE(func)
- FUNC_ENTRY(4)
mov R32(n), R32(%rax)
shr $2, n
+ jz L(0)
and $3, R32(%rax)
- jrcxz L(lt4)
- mov (up), %r8
+L(ent): mov (up), %r8
mov 8(up), %r9
dec n
jmp L(mid)
-L(lt4): dec R32(%rax)
- mov (up), %r8
- jnz L(2)
- ADCSBB (vp), %r8
- mov %r8, (rp)
- adc R32(%rax), R32(%rax)
- FUNC_EXIT()
- ret
-
-L(2): dec R32(%rax)
- mov 8(up), %r9
- jnz L(3)
- ADCSBB (vp), %r8
- ADCSBB 8(vp), %r9
- mov %r8, (rp)
- mov %r9, 8(rp)
- adc R32(%rax), R32(%rax)
- FUNC_EXIT()
- ret
-
-L(3): mov 16(up), %r10
- ADCSBB (vp), %r8
- ADCSBB 8(vp), %r9
- ADCSBB 16(vp), %r10
- mov %r8, (rp)
- mov %r9, 8(rp)
- mov %r10, 16(rp)
- setc R8(%rax)
- FUNC_EXIT()
- ret
-
ALIGN(16)
L(top): ADCSBB (vp), %r8
ADCSBB 8(vp), %r9
@@ -162,8 +103,36 @@ L(end): lea 32(up), up
inc R32(%rax)
dec R32(%rax)
- jnz L(lt4)
- adc R32(%rax), R32(%rax)
- FUNC_EXIT()
+ jnz L(1)
+ adc %eax, %eax
+ ret
+
+L(0): test R32(%rax), R32(%rax)
+L(1): dec R32(%rax)
+ mov (up), %r8
+ jnz L(2)
+ ADCSBB (vp), %r8
+ mov %r8, (rp)
+ adc %eax, %eax
+ ret
+
+L(2): dec R32(%rax)
+ mov 8(up), %r9
+ jnz L(3)
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ adc %eax, %eax
+ ret
+
+L(3): mov 16(up), %r10
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ setc %al
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/aorsmul_1.asm b/gmp/mpn/x86_64/aorsmul_1.asm
index e3fc005757..a25c74ebdc 100644
--- a/gmp/mpn/x86_64/aorsmul_1.asm
+++ b/gmp/mpn/x86_64/aorsmul_1.asm
@@ -1,60 +1,45 @@
dnl AMD64 mpn_addmul_1 and mpn_submul_1.
-dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.5
-C AMD K10 2.5
-C AMD bd1 5.0
-C AMD bobcat 6.17
-C Intel P4 14.9
-C Intel core2 5.09
-C Intel NHM 4.9
-C Intel SBR 4.0
-C Intel atom 21.3
-C VIA nano 5.0
-
-C The loop of this code is the result of running a code generation and
+C K8,K9: 2.5
+C K10: 2.5
+C P4: 14.9
+C P6-15 (Core2): 5.09
+C P6-28 (Atom): 21.3
+
+C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
-C TODO
-C * The loop is great, but the prologue and epilogue code was quickly written.
-C Tune it!
+C TODO:
+C * The inner loop is great, but the prologue and epilogue code was
+C quickly written. Tune it!
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`vl', `%rcx') C r9
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vl', `%rcx')
-define(`n', `%r11')
+define(`n', `%r11')
ifdef(`OPERATION_addmul_1',`
define(`ADDSUB', `add')
@@ -65,33 +50,17 @@ ifdef(`OPERATION_submul_1',`
define(`func', `mpn_submul_1')
')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-IFDOS(` define(`up', ``%rsi'') ') dnl
-IFDOS(` define(`rp', ``%rcx'') ') dnl
-IFDOS(` define(`vl', ``%r9'') ') dnl
-IFDOS(` define(`r9', ``rdi'') ') dnl
-IFDOS(` define(`n', ``%r8'') ') dnl
-IFDOS(` define(`r8', ``r11'') ') dnl
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
-
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
mov (up), %rax C read first u limb early
push %rbx
-IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
-IFDOS(` mov n, %rbx ')
+ mov n_param, %rbx C move away n from rdx, mul uses it
mul vl
-IFSTD(` mov %rbx, n ')
+ mov %rbx, %r11
and $3, R32(%rbx)
jz L(b0)
@@ -143,7 +112,7 @@ L(top): ADDSUB %r10, (rp,n,8)
adc %rax, %r9
mov (up,n,8), %rax
adc %rdx, %r8
- mov $0, R32(%r10)
+ mov $0, %r10d
L(L1): mul vl
ADDSUB %r9, 8(rp,n,8)
adc %rax, %r8
@@ -156,11 +125,11 @@ L(L0): mov 8(up,n,8), %rax
L(L3): mov 16(up,n,8), %rax
mul vl
ADDSUB %rbx, 24(rp,n,8)
- mov $0, R32(%r8) C zero
- mov %r8, %rbx C zero
+ mov $0, %r8d # zero
+ mov %r8, %rbx # zero
adc %rax, %r10
mov 24(up,n,8), %rax
- mov %r8, %r9 C zero
+ mov %r8, %r9 # zero
adc %rdx, %r9
L(L2): mul vl
add $4, n
@@ -174,7 +143,5 @@ L(ret): adc $0, %rdx
mov %rdx, %rax
pop %rbx
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/addmul_2.asm b/gmp/mpn/x86_64/atom/addmul_2.asm
deleted file mode 100644
index c1dcdc44aa..0000000000
--- a/gmp/mpn/x86_64/atom/addmul_2.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-dnl AMD64 mpn_addmul_2 optimised for Intel Atom.
-
-dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9
-C AMD K10
-C AMD bd1
-C AMD bd2
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel PNR
-C Intel NHM
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom 18.8 this
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`vp', `%rcx') C r9
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_addmul_2)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (up), %rax
-
- mov (vp), v0
- mov 8(vp), v1
-
- mov n_param, n
- mul v0
-
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- jnz L(b10)
-
-L(b00): mov %rax, w0
- mov (up), %rax
- mov %rdx, w1
- xor R32(w2), R32(w2)
- lea -8(rp), rp
- jmp L(lo0)
-
-L(b10): mov %rax, w2
- mov (up), %rax
- mov %rdx, w3
- xor R32(w0), R32(w0)
- lea -16(up), up
- lea -24(rp), rp
- jmp L(lo2)
-
-L(bx1): test $2, R8(n)
- jnz L(b11)
-
-L(b01): mov %rax, w3
- mov %rdx, w0
- mov (up), %rax
- xor R32(w1), R32(w1)
- lea 8(up), up
- dec n
- jmp L(lo1)
-
-L(b11): mov %rax, w1
- mov (up), %rax
- mov %rdx, w2
- xor R32(w3), R32(w3)
- lea -8(up), up
- lea -16(rp), rp
- jmp L(lo3)
-
- ALIGN(16)
-L(top):
-L(lo1): mul v1
- add w3, (rp)
- mov $0, R32(w2)
- adc %rax, w0
- mov (up), %rax
- adc %rdx, w1
- mul v0
- add %rax, w0
- mov (up), %rax
- adc %rdx, w1
- adc $0, R32(w2)
-L(lo0): mul v1
- add w0, 8(rp)
- adc %rax, w1
- mov 8(up), %rax
- mov $0, R32(w3)
- adc %rdx, w2
- mul v0
- add %rax, w1
- mov 8(up), %rax
- adc %rdx, w2
- adc $0, R32(w3)
-L(lo3): mul v1
- add w1, 16(rp)
- adc %rax, w2
- mov 16(up), %rax
- mov $0, R32(w0)
- adc %rdx, w3
- mul v0
- add %rax, w2
- mov 16(up), %rax
- adc %rdx, w3
- adc $0, R32(w0)
-L(lo2): mul v1
- add w2, 24(rp)
- adc %rax, w3
- mov 24(up), %rax
- adc %rdx, w0
- mov $0, R32(w1)
- lea 32(rp), rp
- mul v0
- lea 32(up), up
- add %rax, w3
- adc %rdx, w0
- mov -8(up), %rax
- adc $0, R32(w1)
- sub $4, n
- ja L(top)
-
-L(end): mul v1
- add w3, (rp)
- adc %rax, w0
- adc %rdx, w1
- mov w0, 8(rp)
- mov w1, %rax
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/aorrlsh1_n.asm b/gmp/mpn/x86_64/atom/aorrlsh1_n.asm
deleted file mode 100644
index f44de19fef..0000000000
--- a/gmp/mpn/x86_64/atom/aorrlsh1_n.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom.
-dnl Used also for AMD bd1.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO
-C * This code is slightly large at 433 bytes.
-C * sublsh1_n.asm and this file use the same basic pattern.
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bd1 2.3
-C AMD bobcat ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel atom 4.875 (4.75 is probably possible)
-C VIA nano ?
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cy', `%r8')
-
-ifdef(`OPERATION_addlsh1_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_addlsh1_n)
- define(func_nc, mpn_addlsh1_nc)')
-ifdef(`OPERATION_rsblsh1_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsblsh1_n)
- define(func_nc, mpn_rsblsh1_nc)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
- push %rbp
- xor R32(%rbp), R32(%rbp)
-L(ent): mov R32(n), R32(%rax)
- and $3, R32(%rax)
- jz L(b0)
- cmp $2, R32(%rax)
- jz L(b2)
- jg L(b3)
-
-L(b1): mov (vp), %r8
- add %r8, %r8
- lea 8(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- ADCSBB (up), %r8
- mov %r8, (rp)
- sbb R32(%rbp), R32(%rbp) C save acy
- lea 8(up), up
- lea 8(rp), rp
- jmp L(b0)
-
-L(b2): mov (vp), %r8
- add %r8, %r8
- mov 8(vp), %r9
- adc %r9, %r9
- lea 16(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- ADCSBB (up), %r8
- mov %r8, (rp)
- ADCSBB 8(up), %r9
- mov %r9, 8(rp)
- sbb R32(%rbp), R32(%rbp) C save acy
- lea 16(up), up
- lea 16(rp), rp
- jmp L(b0)
-
-L(b3): mov (vp), %r8
- add %r8, %r8
- mov 8(vp), %r9
- adc %r9, %r9
- mov 16(vp), %r10
- adc %r10, %r10
- lea 24(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- ADCSBB (up), %r8
- mov %r8, (rp)
- ADCSBB 8(up), %r9
- mov %r9, 8(rp)
- ADCSBB 16(up), %r10
- mov %r10, 16(rp)
- sbb R32(%rbp), R32(%rbp) C save acy
- lea 24(up), up
- lea 24(rp), rp
-
-L(b0): test $4, R8(n)
- jz L(skp)
- add R32(%rax), R32(%rax) C restore scy
- mov (vp), %r8
- adc %r8, %r8
- mov 8(vp), %r9
- adc %r9, %r9
- mov 16(vp), %r10
- adc %r10, %r10
- mov 24(vp), %r11
- adc %r11, %r11
- lea 32(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- ADCSBB (up), %r8
- mov %r8, (rp)
- ADCSBB 8(up), %r9
- mov %r9, 8(rp)
- ADCSBB 16(up), %r10
- mov %r10, 16(rp)
- ADCSBB 24(up), %r11
- mov %r11, 24(rp)
- lea 32(up), up
- lea 32(rp), rp
- sbb R32(%rbp), R32(%rbp) C save acy
-
-L(skp): cmp $8, n
- jl L(rtn)
-
- push %r12
- push %r13
- push %r14
- push %rbx
- lea -64(rp), rp
- jmp L(x)
-
- ALIGN(16)
-L(top): add R32(%rax), R32(%rax) C restore scy
- lea 64(rp), rp
- mov (vp), %r8
- adc %r8, %r8
- mov 8(vp), %r9
- adc %r9, %r9
- mov 16(vp), %r10
- adc %r10, %r10
- mov 24(vp), %r11
- adc %r11, %r11
- mov 32(vp), %r12
- adc %r12, %r12
- mov 40(vp), %r13
- adc %r13, %r13
- mov 48(vp), %r14
- adc %r14, %r14
- mov 56(vp), %rbx
- adc %rbx, %rbx
- lea 64(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- ADCSBB (up), %r8
- mov %r8, (rp)
- ADCSBB 8(up), %r9
- mov %r9, 8(rp)
- ADCSBB 16(up), %r10
- mov %r10, 16(rp)
- ADCSBB 24(up), %r11
- mov %r11, 24(rp)
- ADCSBB 32(up), %r12
- mov %r12, 32(rp)
- ADCSBB 40(up), %r13
- mov %r13, 40(rp)
- ADCSBB 48(up), %r14
- mov %r14, 48(rp)
- ADCSBB 56(up), %rbx
- mov %rbx, 56(rp)
- sbb R32(%rbp), R32(%rbp) C save acy
- lea 64(up), up
-L(x): sub $8, n
- jge L(top)
-
-L(end): pop %rbx
- pop %r14
- pop %r13
- pop %r12
-L(rtn):
-ifdef(`OPERATION_addlsh1_n',`
- add R32(%rbp), R32(%rax)
- neg R32(%rax)')
-ifdef(`OPERATION_rsblsh1_n',`
- sub R32(%rax), R32(%rbp)
- movslq R32(%rbp), %rax')
-
- pop %rbp
- FUNC_EXIT()
- ret
-EPILOGUE()
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbp
- neg %r8 C set CF
- sbb R32(%rbp), R32(%rbp) C save acy
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/aorrlsh2_n.asm b/gmp/mpn/x86_64/atom/aorrlsh2_n.asm
deleted file mode 100644
index 02fb29dd74..0000000000
--- a/gmp/mpn/x86_64/atom/aorrlsh2_n.asm
+++ /dev/null
@@ -1,191 +0,0 @@
-dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
-dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
-dnl Optimised for Intel Atom.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel atom 5.75
-C VIA nano ?
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-
-define(`LSH', 2)
-define(`RSH', 62)
-define(M, eval(m4_lshift(1,LSH)))
-
-ifdef(`OPERATION_addlsh2_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_addlsh2_n)
- define(func_nc, mpn_addlsh2_nc)')
-ifdef(`OPERATION_rsblsh2_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsblsh2_n)
- define(func_nc, mpn_rsblsh2_nc)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov R32(n), R32(%rax)
- and $3, R32(%rax)
- jz L(b0) C we rely on rax = 0 at target
- cmp $2, R32(%rax)
- mov $0, R32(%rax)
- jz L(b2)
- jg L(b3)
-
-L(b1): mov (vp), %r9
- lea (%rax,%r9,M), %rbp
- shr $RSH, %r9
- sub $1, n
- lea -8(up), up
- lea -8(rp), rp
- jz L(cj1)
- mov 8(vp), %r10
- lea (%r9,%r10,M), %r9
- shr $RSH, %r10
- mov 16(vp), %r11
- lea 24(vp), vp
- mov (vp), %r8
- lea (%r10,%r11,M), %r10
- shr $RSH, %r11
- add R32(%rax), R32(%rax)
- jmp L(L1)
-
-L(b2): lea -32(rp), rp
- mov (vp), %r8
- lea -32(up), up
- lea (%rax,%r8,M), %rbx
- shr $RSH, %r8
- mov 8(vp), %r9
- sub $2, n
- jle L(end)
- jmp L(top)
-
-L(b3): lea -24(up), up
- mov (vp), %r11
- lea -24(rp), rp
- mov 8(vp), %r8
- lea (%rax,%r11,M), %r10
- shr $RSH, %r11
- lea 8(vp), vp
- lea (%r11,%r8,M), %rbx
- add $1, n
- jmp L(L3)
-
-L(b0): lea -16(up), up
- mov (vp), %r10
- lea (%rax,%r10,M), %r9
- shr $RSH, %r10
- mov 8(vp), %r11
- lea -16(rp), rp
- mov 16(vp), %r8
- lea (%r10,%r11,M), %r10
- shr $RSH, %r11
- add R32(%rax), R32(%rax)
- lea 16(vp), vp
- jmp L(L0)
-
- ALIGN(16)
-L(top): lea (%r8,%r9,M), %rbp
- shr $RSH, %r9
- lea 32(up), up
- mov 16(vp), %r10
- lea (%r9,%r10,M), %r9
- shr $RSH, %r10
- mov 24(vp), %r11
- lea 32(rp), rp
- lea 32(vp), vp
- mov (vp), %r8
- lea (%r10,%r11,M), %r10
- shr $RSH, %r11
- add R32(%rax), R32(%rax)
- ADCSBB (up), %rbx
- mov %rbx, (rp)
-L(L1): ADCSBB 8(up), %rbp
- mov %rbp, 8(rp)
-L(L0): ADCSBB 16(up), %r9
- lea (%r11,%r8,M), %rbx
- mov %r9, 16(rp)
-L(L3): ADCSBB 24(up), %r10
- sbb R32(%rax), R32(%rax)
-L(L2): shr $RSH, %r8
- mov 8(vp), %r9
- mov %r10, 24(rp)
- sub $4, n
- jg L(top)
-
-L(end): lea (%r8,%r9,M), %rbp
- shr $RSH, %r9
- lea 32(up), up
- lea 32(rp), rp
- add R32(%rax), R32(%rax)
- ADCSBB (up), %rbx
- mov %rbx, (rp)
-L(cj1): ADCSBB 8(up), %rbp
- mov %rbp, 8(rp)
-
-ifdef(`OPERATION_addlsh2_n',`
- mov R32(n), R32(%rax) C zero rax
- adc %r9, %rax')
-ifdef(`OPERATION_rsblsh2_n',`
- sbb n, %r9 C subtract 0
- mov %r9, %rax')
-
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/aors_n.asm b/gmp/mpn/x86_64/atom/aors_n.asm
index 2c0b7b31a8..32c19424f0 100644
--- a/gmp/mpn/x86_64/atom/aors_n.asm
+++ b/gmp/mpn/x86_64/atom/aors_n.asm
@@ -1,37 +1,142 @@
dnl X86-64 mpn_add_n, mpn_sub_n, optimized for Intel Atom.
-dnl Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+
+C cycles/limb
+C K8,K9: 1.85
+C K10: ?
+C P4: ?
+C P6-15 (Core2): ?
+C P6-28 (Atom): 3
+
+C INPUT PARAMETERS (the code below spells out the raw registers, not these names)
+define(`rp', `%rdi') C destination limbs; results are stored through %rdi
+define(`up', `%rsi') C first source; its limbs are loaded into registers
+define(`vp', `%rdx') C second source; used as the ADCSBB memory operand
+define(`n', `%rcx') C limb count; split into n/4 and n mod 4 at L(ent)
+define(`cy', `%r8') C carry-in; supplied by callers of func_nc, cleared by func
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-include_mpn(`x86_64/coreisbr/aors_n.asm')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc) C carry-in entry point: cy is expected in %r8
+ jmp L(ent) C join the shared body past the instruction that clears %r8
+EPILOGUE()
+PROLOGUE(func) C limb-wise ADCSBB of two n-limb operands, 4 limbs per loop pass; carry out returned in %rax
+ xor %r8, %r8 C no carry-in on this entry point
+L(ent): C func_nc joins here with its carry-in already in %r8
+ mov R32(%rcx), R32(%rax) C copy of n for the residue test
+ shr $2, %rcx C rcx = n / 4, the loop pass counter
+ and $3, R32(%rax) C eax = n mod 4 selects the entry path
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3) C falls through to L(b1) when n mod 4 = 1
+
+L(b1): mov (%rsi), %r10 C n mod 4 = 1
+ test R32(%rcx), R32(%rcx) C ZF set when no full loop pass is needed
+ bt $0, R32(%r8) C CF = carry-in bit; the jnz below still acts on ZF from the test
+ jnz L(gt1)
+ ADCSBB (%rdx), %r10 C single-limb case
+ mov %r10, (%rdi)
+ mov R32(%rcx), R32(%rax) C zero rax: the pass counter is 0 on this path
+ adc R32(%rax), R32(%rax) C return value = carry or borrow out
+ ret
+L(gt1): ADCSBB (%rdx), %r10 C limb 0, kept in r10 until the store at L(m1)
+ mov 8(%rsi), %r11
+ lea 16(%rsi), %rsi C lea adjusts the pointers without disturbing CF
+ lea -16(%rdx), %rdx
+ lea -16(%rdi), %rdi
+ jmp L(m1)
+
+L(b2): mov (%rsi), %r9 C n mod 4 = 2
+ mov 8(%rsi), %r10
+ lea -8(%rdx), %rdx
+ test R32(%rcx), R32(%rcx) C ZF set when no full loop pass is needed
+ bt $0, R32(%r8) C CF = carry-in bit
+ jnz L(gt2)
+ lea -40(%rdi), %rdi C bias rp so offsets 40 and 48 at L(e2) hit limbs 0 and 1
+ jmp L(e2)
+L(gt2): ADCSBB 8(%rdx), %r9
+ mov 16(%rsi), %r11
+ lea -8(%rsi), %rsi
+ lea -8(%rdi), %rdi
+ jmp L(m2)
+
+L(b3): mov (%rsi), %rax C n mod 4 = 3
+ mov 8(%rsi), %r9
+ mov 16(%rsi), %r10
+ test R32(%rcx), R32(%rcx) C ZF set when no full loop pass is needed
+ bt $0, %r8 C CF = carry-in bit
+ jnz L(gt3)
+ lea -32(%rdi), %rdi C bias rp so offsets 32 to 48 at L(e3) hit limbs 0 to 2
+ jmp L(e3)
+L(gt3): ADCSBB (%rdx), %rax
+ jmp L(m3)
+
+L(b0): mov (%rsi), %r11 C n mod 4 = 0
+ neg R32(%r8) C CF = 1 when the carry-in is nonzero
+ lea -24(%rdx), %rdx
+ lea -24(%rdi), %rdi
+ lea 8(%rsi), %rsi
+ jmp L(m0)
+
+ ALIGN(8)
+L(top): mov %r11, 24(%rdi) C loop: store the limb finished in the previous pass
+ ADCSBB (%rdx), %rax
+ lea 32(%rdi), %rdi
+L(m3): mov %rax, (%rdi)
+ ADCSBB 8(%rdx), %r9
+ mov 24(%rsi), %r11
+L(m2): mov %r9, 8(%rdi)
+ ADCSBB 16(%rdx), %r10
+ lea 32(%rsi), %rsi
+L(m1): mov %r10, 16(%rdi)
+L(m0): ADCSBB 24(%rdx), %r11
+ mov (%rsi), %rax C preload the next three source limbs
+ mov 8(%rsi), %r9
+ lea 32(%rdx), %rdx
+ dec %rcx C dec leaves CF alone so the carry chain survives the loop test
+ mov 16(%rsi), %r10
+ jnz L(top)
+
+ mov %r11, 24(%rdi) C wind down: the last three limbs are already in rax, r9, r10
+L(e3): ADCSBB (%rdx), %rax
+ mov %rax, 32(%rdi)
+L(e2): ADCSBB 8(%rdx), %r9
+ mov %r9, 40(%rdi)
+L(e1): ADCSBB 16(%rdx), %r10
+ mov %r10, 48(%rdi)
+ mov R32(%rcx), R32(%rax) C zero rax: the pass counter is 0 on every path reaching here
+ adc R32(%rax), R32(%rax) C return value = carry or borrow out
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/aorsmul_1.asm b/gmp/mpn/x86_64/atom/aorsmul_1.asm
deleted file mode 100644
index e95315347c..0000000000
--- a/gmp/mpn/x86_64/atom/aorsmul_1.asm
+++ /dev/null
@@ -1,190 +0,0 @@
-dnl AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9
-C AMD K10
-C AMD bd1
-C AMD bd2
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel PNR
-C Intel NHM
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom 19.37 this
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0', `%rcx') C r9
-
-define(`n', `%rbx')
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`func', `mpn_submul_1')
-')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
- push %rbx
-
- mov (up), %rax
- lea -8(up,n_param,8), up
- lea -16(rp,n_param,8), rp
-
- test $1, R8(n_param)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n_param)
- jnz L(b10)
-
-L(b00): mov $1, R32(n)
- sub n_param, n
- mul v0
- mov %rax, %r11
- mov 8(up,n,8), %rax
- mov %rdx, %r10
- mul v0
- mov %rax, %r8
- mov 16(up,n,8), %rax
- jmp L(lo0)
-
-L(b10): mov $3, R32(n)
- sub n_param, n
- mul v0
- mov %rax, %r11
- mov -8(up,n,8), %rax
- mov %rdx, %r10
- mul v0
- test n, n
- jns L(cj2)
- mov %rax, %r8
- mov (up,n,8), %rax
- mov %rdx, %r9
- jmp L(lo2)
-
-L(bx1): test $2, R8(n_param)
- jnz L(b11)
-
-L(b01): mov $2, R32(n)
- sub n_param, n
- mul v0
- test n, n
- jns L(cj1)
- mov %rax, %r8
- mov (up,n,8), %rax
- mov %rdx, %r9
- mul v0
- mov %rax, %r11
- mov 8(up,n,8), %rax
- mov %rdx, %r10
- jmp L(lo1)
-
-L(b11): xor R32(n), R32(n)
- sub n_param, n
- mul v0
- mov %rax, %r8
- mov 16(up,n,8), %rax
- mov %rdx, %r9
- mul v0
- mov %rax, %r11
- mov 24(up,n,8), %rax
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mul v0
- ADDSUB %r8, -16(rp,n,8)
- mov %rax, %r8
- mov (up,n,8), %rax
- adc %r9, %r11
- mov %rdx, %r9
- adc $0, %r10
-L(lo2): mul v0
- ADDSUB %r11, -8(rp,n,8)
- mov %rax, %r11
- mov 8(up,n,8), %rax
- adc %r10, %r8
- mov %rdx, %r10
- adc $0, %r9
-L(lo1): mul v0
- ADDSUB %r8, (rp,n,8)
- mov %rax, %r8
- adc %r9, %r11
- mov 16(up,n,8), %rax
- adc $0, %r10
-L(lo0): mov %rdx, %r9
- mul v0
- ADDSUB %r11, 8(rp,n,8)
- mov %rax, %r11
- adc %r10, %r8
- mov 24(up,n,8), %rax
- adc $0, %r9
-L(lo3): add $4, n
- mov %rdx, %r10
- js L(top)
-
-L(end): mul v0
- ADDSUB %r8, -16(rp,n,8)
- adc %r9, %r11
- adc $0, %r10
-L(cj2): ADDSUB %r11, -8(rp,n,8)
- adc %r10, %rax
- adc $0, %rdx
-L(cj1): ADDSUB %rax, (rp,n,8)
- mov $0, R32(%rax)
- adc %rdx, %rax
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/atom/com.asm b/gmp/mpn/x86_64/atom/com.asm
deleted file mode 100644
index 6b6460fffe..0000000000
--- a/gmp/mpn/x86_64/atom/com.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_com optimised for Intel Atom.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_com)
-include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp/mpn/x86_64/atom/copyd.asm b/gmp/mpn/x86_64/atom/copyd.asm
deleted file mode 100644
index e3092794c0..0000000000
--- a/gmp/mpn/x86_64/atom/copyd.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyd optimised for Intel Atom.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyd)
-include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp/mpn/x86_64/atom/copyi.asm b/gmp/mpn/x86_64/atom/copyi.asm
deleted file mode 100644
index 00ec3c23c6..0000000000
--- a/gmp/mpn/x86_64/atom/copyi.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyi optimised for Intel Atom.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyi)
-include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp/mpn/x86_64/atom/dive_1.asm b/gmp/mpn/x86_64/atom/dive_1.asm
deleted file mode 100644
index d9ba5fe6f0..0000000000
--- a/gmp/mpn/x86_64/atom/dive_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_divexact_1)
-include_mpn(`x86_64/nano/dive_1.asm')
diff --git a/gmp/mpn/x86_64/atom/gmp-mparam.h b/gmp/mpn/x86_64/atom/gmp-mparam.h
index 6816dfc362..f06dab4556 100644
--- a/gmp/mpn/x86_64/atom/gmp-mparam.h
+++ b/gmp/mpn/x86_64/atom/gmp-mparam.h
@@ -1,220 +1,76 @@
-/* Intel Atom/64 gmp-mparam.h -- Compiler/machine parameter header file.
+/* Intel Atom gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
-or both in parallel, as here.
+/* Generated by tuneup.c, 2009-01-14, gcc 4.2 */
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-#define SHLD_SLOW 1
-#define SHRD_SLOW 1
-
-/* 1667 MHz Pineview (Atom D510) */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 15
-
-#define MUL_TOOM22_THRESHOLD 12
-#define MUL_TOOM33_THRESHOLD 74
-#define MUL_TOOM44_THRESHOLD 118
-#define MUL_TOOM6H_THRESHOLD 157
-#define MUL_TOOM8H_THRESHOLD 212
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58
-
-#define SQR_BASECASE_THRESHOLD 6
-#define SQR_TOOM2_THRESHOLD 23
-#define SQR_TOOM3_THRESHOLD 49
-#define SQR_TOOM4_THRESHOLD 130
-#define SQR_TOOM6_THRESHOLD 173
-#define SQR_TOOM8_THRESHOLD 238
-
-#define MULMID_TOOM42_THRESHOLD 16
-
-#define MULMOD_BNM1_THRESHOLD 10
-#define SQRMOD_BNM1_THRESHOLD 12
-
-#define MUL_FFT_MODF_THRESHOLD 252 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 252, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
- { 15, 7}, { 8, 6}, { 17, 7}, { 9, 6}, \
- { 19, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
- { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \
- { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \
- { 19, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
- { 33, 9}, { 19, 8}, { 39, 9}, { 23,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \
- { 287,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511, 9}, { 287, 8}, { 575,11}, { 79,10}, \
- { 159, 9}, { 319,10}, { 175, 9}, { 351, 8}, \
- { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \
- { 767,10}, { 207, 9}, { 415,10}, { 223,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
- { 143,10}, { 287, 9}, { 575, 8}, { 1151,10}, \
- { 319, 9}, { 639,11}, { 175,10}, { 351, 9}, \
- { 703, 8}, { 1407, 7}, { 2815,10}, { 383,11}, \
- { 207,10}, { 415,11}, { 223,10}, { 447,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 287,10}, { 575, 9}, { 1151,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \
- { 1407,12}, { 191,11}, { 415,12}, { 223,11}, \
- { 447,10}, { 895,11}, { 479,13}, { 127,12}, \
- { 255,11}, { 511,12}, { 287,11}, { 575,10}, \
- { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \
- { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
- { 895,12}, { 479,14}, { 127,13}, { 255,12}, \
- { 511,11}, { 1023,12}, { 575,11}, { 1151,13}, \
- { 319,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 831,13}, { 447,12}, { 895,11}, { 1791,14}, \
- { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
- { 1151,13}, { 703,12}, { 1407,14}, { 383,13}, \
- { 831,12}, { 1663,13}, { 895,12}, { 1791,15}, \
- { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
- { 1151,14}, { 639,13}, { 1407,12}, { 2815,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1919,12}, \
- { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \
- { 1151,13}, { 2431,14}, { 1407,13}, { 2815,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 185
-#define MUL_FFT_THRESHOLD 2240
-
-#define SQR_FFT_MODF_THRESHOLD 208 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 208, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \
- { 7, 7}, { 17, 8}, { 9, 7}, { 19, 8}, \
- { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
- { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
- { 15, 8}, { 31, 9}, { 23,10}, { 15, 9}, \
- { 39,10}, { 23,11}, { 15,10}, { 31, 9}, \
- { 63, 8}, { 127,10}, { 39, 9}, { 79, 8}, \
- { 159,10}, { 47, 8}, { 191,10}, { 55,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
- { 71, 9}, { 143, 8}, { 287, 7}, { 575,10}, \
- { 79, 9}, { 159,11}, { 47, 9}, { 191,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 143, 9}, { 287, 8}, { 575,10}, \
- { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \
- { 351, 8}, { 703,10}, { 191, 9}, { 383,10}, \
- { 207,11}, { 111,10}, { 223, 9}, { 447,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
- { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
- { 319, 9}, { 639,11}, { 175,10}, { 351, 9}, \
- { 703,11}, { 191,10}, { 383,11}, { 207,10}, \
- { 415,11}, { 223,10}, { 447,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 287,10}, \
- { 575,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,12}, { 223,11}, { 447,10}, \
- { 895,13}, { 127,12}, { 255,11}, { 511,12}, \
- { 287,11}, { 575,12}, { 319,11}, { 639,12}, \
- { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
- { 895,14}, { 127,13}, { 255,12}, { 511,11}, \
- { 1023,12}, { 575,11}, { 1151,13}, { 319,12}, \
- { 703,11}, { 1407,13}, { 383,12}, { 831,13}, \
- { 447,12}, { 895,14}, { 255,13}, { 511,12}, \
- { 1023,13}, { 575,12}, { 1151,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \
- { 895,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1151,14}, { 639,13}, { 1407,12}, \
- { 2815,14}, { 767,13}, { 1663,14}, { 895,13}, \
- { 1791,12}, { 3583,15}, { 511,14}, { 1023,13}, \
- { 2047,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
- { 1407,13}, { 2815,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 175
-#define SQR_FFT_THRESHOLD 1600
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 34
-#define MULLO_MUL_N_THRESHOLD 4392
-
-#define DC_DIV_QR_THRESHOLD 32
-#define DC_DIVAPPR_Q_THRESHOLD 122
-#define DC_BDIV_QR_THRESHOLD 35
-#define DC_BDIV_Q_THRESHOLD 76
-
-#define INV_MULMOD_BNM1_THRESHOLD 22
-#define INV_NEWTON_THRESHOLD 163
-#define INV_APPR_THRESHOLD 134
-
-#define BINV_NEWTON_THRESHOLD 179
-#define REDC_1_TO_REDC_2_THRESHOLD 17
-#define REDC_2_TO_REDC_N_THRESHOLD 43
-
-#define MU_DIV_QR_THRESHOLD 855
-#define MU_DIVAPPR_Q_THRESHOLD 872
-#define MUPI_DIV_QR_THRESHOLD 83
-#define MU_BDIV_QR_THRESHOLD 748
-#define MU_BDIV_Q_THRESHOLD 807
-
-#define POWM_SEC_TABLE 1,16,114,452,1603
-
-#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 102
-#define HGCD_APPR_THRESHOLD 95
-#define HGCD_REDUCE_THRESHOLD 1329
-#define GCD_DC_THRESHOLD 268
-#define GCDEXT_DC_THRESHOLD 221
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 418
-#define SET_STR_PRECOMPUTE_THRESHOLD 1420
-
-#define FAC_DSC_THRESHOLD 1065
-#define FAC_ODD_THRESHOLD 0 /* always */
+#define MUL_KARATSUBA_THRESHOLD 10
+#define MUL_TOOM3_THRESHOLD 66
+#define MUL_TOOM44_THRESHOLD 118
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 18
+#define SQR_TOOM3_THRESHOLD 98
+#define SQR_TOOM4_THRESHOLD 166
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 24
+#define MULLOW_MUL_N_THRESHOLD 170
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 30
+#define POWM_THRESHOLD 48
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 86
+#define GCD_DC_THRESHOLD 196
+#define GCDEXT_DC_THRESHOLD 236
+#define JACOBI_BASE_METHOD 3
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 8
+#define MOD_1_2_THRESHOLD 9
+#define MOD_1_4_THRESHOLD 24
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 19
+#define GET_STR_PRECOMPUTE_THRESHOLD 35
+#define SET_STR_DC_THRESHOLD 268
+#define SET_STR_PRECOMPUTE_THRESHOLD 1142
+
+#define MUL_FFT_TABLE { 272, 544, 1088, 1792, 5120, 12288, 49152, 196608, 786432, 0 }
+#define MUL_FFT_MODF_THRESHOLD 240
+#define MUL_FFT_THRESHOLD 1408
+
+#define SQR_FFT_TABLE { 240, 544, 1216, 2304, 5120, 12288, 49152, 196608, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD 240
+#define SQR_FFT_THRESHOLD 1408
+
+/* These tables need to be updated. */
+
+#define MUL_FFT_TABLE2 {{1,4}, {209,5}, {417,6}, {961,7}, {2177,8}, {4865,9}, {5633,8}, {6401,9}, {7681,8}, {8449,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {24065,11}, {30721,10}, {48129,11}, {63489,10}, {81409,11}, {96257,9}, {106497,10}, {107521,12}, {126977,11}, {129025,10}, {130049,9}, {130561,10}, {140289,9}, {140801,10}, {147201,11}, {161793,10}, {212481,11}, {228865,12}, {258049,11}, {457729,13}, {516097,12}, {520193,11}, {588801,12}, {651265,11}, {719873,12}, {782337,11}, {849921,12}, {916481,13}, {1040385,12}, {1439745,13}, {1564673,12}, {1830913,11}, {1832961,13}, {1835009,14}, {MP_SIZE_T_MAX, 0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {177,5}, {353,6}, {833,7}, {2177,8}, {4865,9}, {5633,8}, {6401,10}, {7169,9}, {11777,10}, {15361,9}, {19969,10}, {23553,9}, {24065,11}, {30721,10}, {48129,11}, {63489,10}, {65537,9}, {73217,8}, {73601,9}, {80129,10}, {80897,9}, {81665,11}, {96257,9}, {97793,8}, {98817,10}, {99329,12}, {126977,10}, {130049,9}, {131073,10}, {143361,9}, {144385,10}, {151041,9}, {151553,10}, {154113,9}, {154625,10}, {157697,9}, {159745,10}, {195585,9}, {196609,11}, {206849,10}, {207873,11}, {220161,10}, {222209,11}, {228865,12}, {258049,11}, {272385,10}, {274433,11}, {276481,10}, {277505,11}, {280577,10}, {282625,11}, {391169,10}, {397313,11}, {401409,10}, {423937,11}, {457729,13}, {516097,12}, {520193,11}, {588801,12}, {651265,11}, {718849,12}, {782337,11}, {845825,12}, {915457,13}, {1040385,12}, {1437697,13}, {1564673,12}, {1830913,14}, {MP_SIZE_T_MAX, 0}}
diff --git a/gmp/mpn/x86_64/atom/lshift.asm b/gmp/mpn/x86_64/atom/lshift.asm
deleted file mode 100644
index 1b37d5dccf..0000000000
--- a/gmp/mpn/x86_64/atom/lshift.asm
+++ /dev/null
@@ -1,123 +0,0 @@
-dnl AMD64 mpn_lshift -- mpn left shift, optimised for Atom.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel atom 4.5
-C VIA nano ?
-
-C TODO
-C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times
-C larger.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_lshift)
- FUNC_ENTRY(4)
- lea -8(up,n,8), up
- lea -8(rp,n,8), rp
- shr R32(n)
- mov (up), %rax
- jnc L(evn)
-
- mov %rax, %r11
- shl R8(%rcx), %r11
- neg R8(%rcx)
- shr R8(%rcx), %rax
- test n, n
- jnz L(gt1)
- mov %r11, (rp)
- FUNC_EXIT()
- ret
-
-L(gt1): mov -8(up), %r8
- mov %r8, %r10
- shr R8(%rcx), %r8
- jmp L(lo1)
-
-L(evn): mov %rax, %r10
- neg R8(%rcx)
- shr R8(%rcx), %rax
- mov -8(up), %r9
- mov %r9, %r11
- shr R8(%rcx), %r9
- neg R8(%rcx)
- dec n
- lea 8(rp), rp
- lea -8(up), up
- jz L(end)
-
- ALIGN(8)
-L(top): shl R8(%rcx), %r10
- or %r10, %r9
- shl R8(%rcx), %r11
- neg R8(%rcx)
- mov -8(up), %r8
- mov %r8, %r10
- mov %r9, -8(rp)
- shr R8(%rcx), %r8
- lea -16(rp), rp
-L(lo1): mov -16(up), %r9
- or %r11, %r8
- mov %r9, %r11
- shr R8(%rcx), %r9
- lea -16(up), up
- neg R8(%rcx)
- mov %r8, (rp)
- dec n
- jg L(top)
-
-L(end): shl R8(%rcx), %r10
- or %r10, %r9
- shl R8(%rcx), %r11
- mov %r9, -8(rp)
- mov %r11, -16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/lshiftc.asm b/gmp/mpn/x86_64/atom/lshiftc.asm
deleted file mode 100644
index 7385f8fd44..0000000000
--- a/gmp/mpn/x86_64/atom/lshiftc.asm
+++ /dev/null
@@ -1,127 +0,0 @@
-dnl AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel atom 5
-C VIA nano ?
-
-C TODO
-C * Consider using 4-way unrolling. We reach 4.5 c/l, but the code is 2.5
-C times larger.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_lshiftc)
- FUNC_ENTRY(4)
- lea -8(up,n,8), up
- lea -8(rp,n,8), rp
- shr R32(n)
- mov (up), %rax
- jnc L(evn)
-
- mov %rax, %r11
- shl R8(%rcx), %r11
- neg R8(%rcx)
- shr R8(%rcx), %rax
- test n, n
- jnz L(gt1)
- not %r11
- mov %r11, (rp)
- FUNC_EXIT()
- ret
-
-L(gt1): mov -8(up), %r8
- mov %r8, %r10
- shr R8(%rcx), %r8
- jmp L(lo1)
-
-L(evn): mov %rax, %r10
- neg R8(%rcx)
- shr R8(%rcx), %rax
- mov -8(up), %r9
- mov %r9, %r11
- shr R8(%rcx), %r9
- neg R8(%rcx)
- lea 8(rp), rp
- lea -8(up), up
- jmp L(lo0)
-
-C ALIGN(16)
-L(top): shl R8(%rcx), %r10
- or %r10, %r9
- shl R8(%rcx), %r11
- not %r9
- neg R8(%rcx)
- mov -8(up), %r8
- lea -16(rp), rp
- mov %r8, %r10
- shr R8(%rcx), %r8
- mov %r9, 8(rp)
-L(lo1): or %r11, %r8
- mov -16(up), %r9
- mov %r9, %r11
- shr R8(%rcx), %r9
- lea -16(up), up
- neg R8(%rcx)
- not %r8
- mov %r8, (rp)
-L(lo0): dec n
- jg L(top)
-
-L(end): shl R8(%rcx), %r10
- or %r10, %r9
- not %r9
- shl R8(%rcx), %r11
- not %r11
- mov %r9, -8(rp)
- mov %r11, -16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/mul_1.asm b/gmp/mpn/x86_64/atom/mul_1.asm
deleted file mode 100644
index d76a3d3b8c..0000000000
--- a/gmp/mpn/x86_64/atom/mul_1.asm
+++ /dev/null
@@ -1,143 +0,0 @@
-dnl AMD64 mpn_mul_1 optimised for Intel Atom.
-
-dnl Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9
-C AMD K10
-C AMD bd1
-C AMD bd2
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel PNR
-C Intel NHM
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom 17.3 this
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0', `%rcx') C r9
-
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_1)
- FUNC_ENTRY(4)
- xor %r8, %r8
-L(com): mov (up), %rax
- lea -16(up,n_param,8), up
- lea -8(rp,n_param,8), rp
- test $1, R8(n_param)
- jnz L(bx1)
-
-L(bx0): mov %r8, %r9
- test $2, R8(n_param)
- jnz L(b10)
-
-L(b00): mov $2, R32(n)
- sub n_param, n
- jmp L(lo0)
-
-L(bx1): test $2, R8(n_param)
- jnz L(b11)
-
-L(b01): mov $3, R32(n)
- sub n_param, n
- mul v0
- cmp $2, n
- jnz L(lo1)
- jmp L(cj1)
-
-L(b11): mov $1, R32(n)
- sub n_param, n
- jmp L(lo3)
-
-L(b10): xor R32(n), R32(n)
- sub n_param, n
- jmp L(lo2)
-
-L(top): mul v0
- mov %r9, -24(rp,n,8)
-L(lo1): xor %r9d, %r9d
- add %rax, %r8
- mov (up,n,8), %rax
- adc %rdx, %r9
- mov %r8, -16(rp,n,8)
-L(lo0): xor %r8d, %r8d
- mul v0
- add %rax, %r9
- mov 8(up,n,8), %rax
- adc %rdx, %r8
- mov %r9, -8(rp,n,8)
-L(lo3): xor %r9d, %r9d
- mul v0
- add %rax, %r8
- mov 16(up,n,8), %rax
- adc %rdx, %r9
- mov %r8, (rp,n,8)
-L(lo2): xor %r8d, %r8d
- mul v0
- add %rax, %r9
- mov 24(up,n,8), %rax
- adc %rdx, %r8
- add $4, n
- js L(top)
-
-L(end): mul v0
- mov %r9, -8(rp)
-L(cj1): add %rax, %r8
- mov $0, R32(%rax)
- adc %rdx, %rax
- mov %r8, (rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
-
-PROLOGUE(mpn_mul_1c)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- jmp L(com)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/atom/mul_2.asm b/gmp/mpn/x86_64/atom/mul_2.asm
deleted file mode 100644
index f3fc3afdd1..0000000000
--- a/gmp/mpn/x86_64/atom/mul_2.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-dnl AMD64 mpn_mul_2 optimised for Intel Atom.
-
-dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9
-C AMD K10
-C AMD bd1
-C AMD bd2
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel PNR
-C Intel NHM
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom 17.75 this
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`vp', `%rcx') C r9
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_2)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (up), %rax
-
- mov (vp), v0
- mov 8(vp), v1
-
- mov n_param, n
- mul v0
-
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- jnz L(b10)
-
-L(b00): mov %rax, w0
- mov (up), %rax
- mov %rdx, w1
- xor R32(w2), R32(w2)
- lea -8(rp), rp
- jmp L(lo0)
-
-L(b10): mov %rax, w2
- mov (up), %rax
- mov %rdx, w3
- xor R32(w0), R32(w0)
- lea -16(up), up
- lea -24(rp), rp
- jmp L(lo2)
-
-L(bx1): test $2, R8(n)
- jnz L(b11)
-
-L(b01): mov %rax, w3
- mov %rdx, w0
- mov (up), %rax
- xor R32(w1), R32(w1)
- lea 8(up), up
- dec n
- jmp L(lo1)
-
-L(b11): mov %rax, w1
- mov (up), %rax
- mov %rdx, w2
- xor R32(w3), R32(w3)
- lea -8(up), up
- lea -16(rp), rp
- jmp L(lo3)
-
- ALIGN(16)
-L(top):
-L(lo1): mul v1
- add %rax, w0
- mov (up), %rax
- mov $0, R32(w2)
- mov w3, (rp)
- adc %rdx, w1
- mul v0
- add %rax, w0
- mov (up), %rax
- adc %rdx, w1
- adc $0, R32(w2)
-L(lo0): mul v1
- add %rax, w1
- mov 8(up), %rax
- mov w0, 8(rp)
- adc %rdx, w2
- mul v0
- add %rax, w1
- mov 8(up), %rax
- adc %rdx, w2
- mov $0, R32(w3)
- adc $0, R32(w3)
-L(lo3): mul v1
- add %rax, w2
- mov 16(up), %rax
- mov w1, 16(rp)
- mov $0, R32(w0)
- adc %rdx, w3
- mul v0
- add %rax, w2
- mov 16(up), %rax
- adc %rdx, w3
-L(lo2): mov $0, R32(w1)
- mov w2, 24(rp)
- adc $0, R32(w0)
- mul v1
- add %rax, w3
- mov 24(up), %rax
- lea 32(up), up
- adc %rdx, w0
- mul v0
- lea 32(rp), rp
- add %rax, w3
- adc %rdx, w0
- mov -8(up), %rax
- adc $0, R32(w1)
- sub $4, n
- ja L(top)
-
-L(end): mul v1
- mov w3, (rp)
- add %rax, w0
- adc %rdx, w1
- mov w0, 8(rp)
- mov w1, %rax
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/popcount.asm b/gmp/mpn/x86_64/atom/popcount.asm
deleted file mode 100644
index fb14dd3d31..0000000000
--- a/gmp/mpn/x86_64/atom/popcount.asm
+++ /dev/null
@@ -1,35 +0,0 @@
-dnl x86-64 mpn_popcount.
-
-dnl Copyright 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp/mpn/x86_64/atom/redc_1.asm b/gmp/mpn/x86_64/atom/redc_1.asm
deleted file mode 100644
index d93c19fdc0..0000000000
--- a/gmp/mpn/x86_64/atom/redc_1.asm
+++ /dev/null
@@ -1,574 +0,0 @@
-dnl X86-64 mpn_redc_1 optimised for Intel Atom.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bull ?
-C AMD pile ?
-C AMD steam ?
-C AMD bobcat 5.0
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom ?
-C VIA nano ?
-
-C TODO
-C * Micro-optimise, none performed thus far.
-C * Consider inlining mpn_add_n.
-C * Single basecases out before the pushes.
-C * Make lead-in code for the inner loops be more similar.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%r12')
-define(`q0', `%r13')
-define(`w0', `%rbp')
-define(`w1', `%r9')
-define(`w2', `%r10')
-define(`w3', `%r11')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_redc_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (up), q0
- mov n, j C outer loop induction var
- lea (mp_param,n,8), mp
- lea (up,n,8), up
- neg n
- imul u0inv, q0 C first iteration q0
-
- test $1, R8(n)
- jz L(bx0)
-
-L(bx1): test $2, R8(n)
- jz L(b3)
-
-L(b1): cmp $-1, R32(n)
- jz L(n1)
-
-L(otp1):lea 1(n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, %rbp
- mov 8(mp,n,8), %rax
- mov %rdx, %r9
- mul q0
- mov %rax, %rbx
- mov 16(mp,n,8), %rax
- mov %rdx, %r10
- mul q0
- add (up,n,8), %rbp
- mov %rax, %rbp
- adc %r9, %rbx
- mov 24(mp,n,8), %rax
- adc $0, %r10
- mov %rdx, %r9
- mul q0
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- mov %rax, %r11
- adc %r10, %rbp
- mov 32(mp,n,8), %rax
- adc $0, %r9
- imul u0inv, %rbx C next q limb
- jmp L(e1)
-
- ALIGNx
-L(tp1): mul q0
- add %rbp, -24(up,i,8)
- mov %rax, %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %rdx, %r9
- adc $0, %r10
- mul q0
- add %r11, -16(up,i,8)
- mov %rax, %r11
- mov 8(mp,i,8), %rax
- adc %r10, %rbp
- mov %rdx, %r10
- adc $0, %r9
- mul q0
- add %rbp, -8(up,i,8)
- mov %rax, %rbp
- adc %r9, %r11
- mov 16(mp,i,8), %rax
- adc $0, %r10
- mov %rdx, %r9
- mul q0
- add %r11, (up,i,8)
- mov %rax, %r11
- adc %r10, %rbp
- mov 24(mp,i,8), %rax
- adc $0, %r9
-L(e1): add $4, i
- mov %rdx, %r10
- js L(tp1)
-
-L(ed1): mul q0
- add %rbp, I(-24(up),-24(up,i,8))
- adc %r9, %r11
- adc $0, %r10
- add %r11, I(-16(up),-16(up,i,8))
- adc %r10, %rax
- adc $0, %rdx
- add %rax, I(-8(up),-8(up,i,8))
- adc $0, %rdx
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp1)
- jmp L(cj)
-
-L(b3): cmp $-3, R32(n)
- jz L(n3)
-
-L(otp3):lea 3(n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, %rbp
- mov 8(mp,n,8), %rax
- mov %rdx, %r9
- mul q0
- mov %rax, %rbx
- mov 16(mp,n,8), %rax
- mov %rdx, %r10
- mul q0
- add (up,n,8), %rbp
- mov %rax, %rbp
- mov 24(mp,n,8), %rax
- adc %r9, %rbx
- mov %rdx, %r9
- adc $0, %r10
- mul q0
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- mov %rax, %r11
- mov 32(mp,n,8), %rax
- adc %r10, %rbp
- mov %rdx, %r10
- adc $0, %r9
- imul u0inv, %rbx C next q limb
- jmp L(e3)
-
- ALIGNx
-L(tp3): mul q0
- add %rbp, -24(up,i,8)
- mov %rax, %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %rdx, %r9
- adc $0, %r10
- mul q0
- add %r11, -16(up,i,8)
- mov %rax, %r11
- mov 8(mp,i,8), %rax
- adc %r10, %rbp
- mov %rdx, %r10
- adc $0, %r9
-L(e3): mul q0
- add %rbp, -8(up,i,8)
- mov %rax, %rbp
- adc %r9, %r11
- mov 16(mp,i,8), %rax
- adc $0, %r10
- mov %rdx, %r9
- mul q0
- add %r11, (up,i,8)
- mov %rax, %r11
- adc %r10, %rbp
- mov 24(mp,i,8), %rax
- adc $0, %r9
- add $4, i
- mov %rdx, %r10
- js L(tp3)
-
-L(ed3): mul q0
- add %rbp, I(-24(up),-24(up,i,8))
- adc %r9, %r11
- adc $0, %r10
- add %r11, I(-16(up),-16(up,i,8))
- adc %r10, %rax
- adc $0, %rdx
- add %rax, I(-8(up),-8(up,i,8))
- adc $0, %rdx
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp3)
-C jmp L(cj)
-
-L(cj):
-IFSTD(` lea (up,n,8), up C param 2: up
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` lea (up,n,8), %rdx C param 2: up
- lea (%rdx,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov rp, %rcx ') C param 1: rp
-
- CALL( mpn_add_n)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(bx0): test $2, R8(n)
- jnz L(b2)
-
-L(b0): cmp $-4, R32(n)
- jz L(n4)
-
-L(otp0):lea 4(n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, %r11
- mov 8(mp,n,8), %rax
- mov %rdx, %r10
- mul q0
- mov %rax, %rbx
- mov 16(mp,n,8), %rax
- mov %rdx, %r9
- mul q0
- add (up,n,8), %r11
- mov %rax, %r11
- adc %r10, %rbx
- mov 24(mp,n,8), %rax
- adc $0, %r9
- mov %rdx, %r10
- mul q0
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- mov %rax, %rbp
- mov 32(mp,n,8), %rax
- adc %r9, %r11
- mov %rdx, %r9
- adc $0, %r10
- imul u0inv, %rbx C next q limb
- jmp L(e0)
-
- ALIGNx
-L(tp0): mul q0
- add %rbp, -24(up,i,8)
- mov %rax, %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %rdx, %r9
- adc $0, %r10
-L(e0): mul q0
- add %r11, -16(up,i,8)
- mov %rax, %r11
- mov 8(mp,i,8), %rax
- adc %r10, %rbp
- mov %rdx, %r10
- adc $0, %r9
- mul q0
- add %rbp, -8(up,i,8)
- mov %rax, %rbp
- adc %r9, %r11
- mov 16(mp,i,8), %rax
- adc $0, %r10
- mov %rdx, %r9
- mul q0
- add %r11, (up,i,8)
- mov %rax, %r11
- adc %r10, %rbp
- mov 24(mp,i,8), %rax
- adc $0, %r9
- add $4, i
- mov %rdx, %r10
- js L(tp0)
-
-L(ed0): mul q0
- add %rbp, I(-24(up),-24(up,i,8))
- adc %r9, %r11
- adc $0, %r10
- add %r11, I(-16(up),-16(up,i,8))
- adc %r10, %rax
- adc $0, %rdx
- add %rax, I(-8(up),-8(up,i,8))
- adc $0, %rdx
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp0)
- jmp L(cj)
-
-L(b2): cmp $-2, R32(n)
- jz L(n2)
-
-L(otp2):lea 2(n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, %r11
- mov 8(mp,n,8), %rax
- mov %rdx, %r10
- mul q0
- mov %rax, %rbx
- mov 16(mp,n,8), %rax
- mov %rdx, %r9
- mul q0
- add (up,n,8), %r11
- mov %rax, %r11
- adc %r10, %rbx
- mov 24(mp,n,8), %rax
- adc $0, %r9
- mov %rdx, %r10
- mul q0
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- mov %rax, %rbp
- mov 32(mp,n,8), %rax
- adc %r9, %r11
- mov %rdx, %r9
- adc $0, %r10
- imul u0inv, %rbx C next q limb
- jmp L(e2)
-
- ALIGNx
-L(tp2): mul q0
- add %rbp, -24(up,i,8)
- mov %rax, %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %rdx, %r9
- adc $0, %r10
- mul q0
- add %r11, -16(up,i,8)
- mov %rax, %r11
- mov 8(mp,i,8), %rax
- adc %r10, %rbp
- mov %rdx, %r10
- adc $0, %r9
- mul q0
- add %rbp, -8(up,i,8)
- mov %rax, %rbp
- adc %r9, %r11
- mov 16(mp,i,8), %rax
- adc $0, %r10
- mov %rdx, %r9
-L(e2): mul q0
- add %r11, (up,i,8)
- mov %rax, %r11
- adc %r10, %rbp
- mov 24(mp,i,8), %rax
- adc $0, %r9
- add $4, i
- mov %rdx, %r10
- js L(tp2)
-
-L(ed2): mul q0
- add %rbp, I(-24(up),-24(up,i,8))
- adc %r9, %r11
- adc $0, %r10
- add %r11, I(-16(up),-16(up,i,8))
- adc %r10, %rax
- adc $0, %rdx
- add %rax, I(-8(up),-8(up,i,8))
- adc $0, %rdx
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp2)
- jmp L(cj)
-
-L(n1): mov (mp_param), %rax
- mul q0
- add -8(up), %rax
- adc (up), %rdx
- mov %rdx, (rp)
- mov $0, R32(%rax)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-L(n2): mov (mp_param), %rax
- mov -16(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov -8(up), %r10
- mul q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- add %r9, %r10
- adc $0, %r11
- mov %r10, q0
- imul u0inv, q0 C next q0
- mov -16(mp), %rax
- mul q0
- add %rax, %r10
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov (up), %r14
- mul q0
- add %rax, %r14
- adc $0, %rdx
- add %r9, %r14
- adc $0, %rdx
- xor R32(%rax), R32(%rax)
- add %r11, %r14
- adc 8(up), %rdx
- mov %r14, (rp)
- mov %rdx, 8(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
- ALIGNx
-L(n3): mov -24(mp), %rax
- mov -24(up), %r10
- mul q0
- add %rax, %r10
- mov -16(mp), %rax
- mov %rdx, %r11
- adc $0, %r11
- mov -16(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- add %r11, %rbp
- mov -8(up), %r10
- adc $0, %r9
- mul q0
- mov %rbp, q0
- imul u0inv, q0 C next q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- mov %rbp, -16(up)
- add %r9, %r10
- adc $0, %r11
- mov %r10, -8(up)
- mov %r11, -24(up) C up[0]
- lea 8(up), up C up++
- dec j
- jnz L(n3)
-
- mov -48(up), %rdx
- mov -40(up), %rbx
- xor R32(%rax), R32(%rax)
- add %rbp, %rdx
- adc %r10, %rbx
- adc -8(up), %r11
- mov %rdx, (rp)
- mov %rbx, 8(rp)
- mov %r11, 16(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-L(n4): mov -32(mp), %rax
- mul q0
- mov %rax, %r11
- mov -24(mp), %rax
- mov %rdx, %r10
- mul q0
- mov %rax, %rbx
- mov -16(mp), %rax
- mov %rdx, %r9
- mul q0
- add -32(up), %r11
- mov %rax, %r11
- adc %r10, %rbx
- mov -8(mp), %rax
- adc $0, %r9
- mov %rdx, %r10
- mul q0
- add -24(up), %rbx
- mov %rbx, -24(up)
- adc %r9, %r11
- adc $0, %r10
- imul u0inv, %rbx C next q limb
- add %r11, -16(up)
- adc %r10, %rax
- adc $0, %rdx
- add %rax, -8(up)
- adc $0, %rdx
- mov %rdx, -32(up) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- dec j
- lea 8(up), up C up++
- jnz L(n4)
- jmp L(cj)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/atom/rsh1aors_n.asm b/gmp/mpn/x86_64/atom/rsh1aors_n.asm
deleted file mode 100644
index 6f5f6384a7..0000000000
--- a/gmp/mpn/x86_64/atom/rsh1aors_n.asm
+++ /dev/null
@@ -1,287 +0,0 @@
-dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO
-C * Schedule loop less. It is now almost surely overscheduled, resulting in
-C large feed-in and wind-down code.
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NMH ?
-C Intel SBR ?
-C Intel atom 5.25
-C VIA nano ?
-
-C INPUT PARAMETERS
-define(`rp',`%rdi')
-define(`up',`%rsi')
-define(`vp',`%rdx')
-define(`n',`%rcx')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_rsh1add_n)
- define(func_nc, mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsh1sub_n)
- define(func_nc, mpn_rsh1sub_nc)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (up), %r15
- ADDSUB (vp), %r15
- sbb R32(%rbx), R32(%rbx)
- xor R32(%rax), R32(%rax)
- shr %r15
- adc R32(%rax), R32(%rax) C return value
-
- mov R32(n), R32(%rbp)
- and $3, R32(%rbp)
- jz L(b0)
- cmp $2, R32(%rbp)
- jae L(b23)
-
-L(b1): dec n
- jnz L(gt1)
- shl $63, %rbx
- add %rbx, %r15
- mov %r15, (rp)
- jmp L(cj1)
-L(gt1): lea 24(up), up
- lea 24(vp), vp
- mov -16(up), %r9
- add R32(%rbx), R32(%rbx)
- mov -8(up), %r10
- lea 24(rp), rp
- mov (up), %r11
- ADCSBB -16(vp), %r9
- ADCSBB -8(vp), %r10
- mov %r15, %r12
- ADCSBB (vp), %r11
- mov %r9, %r13
- sbb R32(%rbx), R32(%rbx)
- mov %r11, %r15
- mov %r10, %r14
- shl $63, %r11
- shl $63, %r10
- shl $63, %r9
- or %r9, %r12
- shr %r13
- mov 8(up), %r8
- shr %r14
- or %r10, %r13
- shr %r15
- or %r11, %r14
- sub $4, n
- jz L(cj5)
-L(gt5): mov 16(up), %r9
- add R32(%rbx), R32(%rbx)
- mov 24(up), %r10
- ADCSBB 8(vp), %r8
- mov %r15, %rbp
- mov 32(up), %r11
- jmp L(lo1)
-
-L(b23): jnz L(b3)
- mov 8(up), %r8
- sub $2, n
- jnz L(gt2)
- add R32(%rbx), R32(%rbx)
- ADCSBB 8(vp), %r8
- mov %r8, %r12
- jmp L(cj2)
-L(gt2): mov 16(up), %r9
- add R32(%rbx), R32(%rbx)
- mov 24(up), %r10
- ADCSBB 8(vp), %r8
- mov %r15, %rbp
- mov 32(up), %r11
- ADCSBB 16(vp), %r9
- lea 32(up), up
- ADCSBB 24(vp), %r10
- mov %r9, %r13
- ADCSBB 32(vp), %r11
- mov %r8, %r12
- jmp L(lo2)
-
-L(b3): lea 40(up), up
- lea 8(vp), vp
- mov %r15, %r14
- add R32(%rbx), R32(%rbx)
- mov -32(up), %r11
- ADCSBB 0(vp), %r11
- lea 8(rp), rp
- sbb R32(%rbx), R32(%rbx)
- mov %r11, %r15
- shl $63, %r11
- mov -24(up), %r8
- shr %r15
- or %r11, %r14
- sub $3, n
- jnz L(gt3)
- add R32(%rbx), R32(%rbx)
- ADCSBB 8(vp), %r8
- jmp L(cj3)
-L(gt3): mov -16(up), %r9
- add R32(%rbx), R32(%rbx)
- mov -8(up), %r10
- ADCSBB 8(vp), %r8
- mov %r15, %rbp
- mov (up), %r11
- ADCSBB 16(vp), %r9
- ADCSBB 24(vp), %r10
- mov %r8, %r12
- jmp L(lo3)
-
-L(b0): lea 48(up), up
- lea 16(vp), vp
- add R32(%rbx), R32(%rbx)
- mov -40(up), %r10
- lea 16(rp), rp
- mov -32(up), %r11
- ADCSBB -8(vp), %r10
- mov %r15, %r13
- ADCSBB (vp), %r11
- sbb R32(%rbx), R32(%rbx)
- mov %r11, %r15
- mov %r10, %r14
- shl $63, %r11
- shl $63, %r10
- mov -24(up), %r8
- shr %r14
- or %r10, %r13
- shr %r15
- or %r11, %r14
- sub $4, n
- jnz L(gt4)
- add R32(%rbx), R32(%rbx)
- ADCSBB 8(vp), %r8
- jmp L(cj4)
-L(gt4): mov -16(up), %r9
- add R32(%rbx), R32(%rbx)
- mov -8(up), %r10
- ADCSBB 8(vp), %r8
- mov %r15, %rbp
- mov (up), %r11
- ADCSBB 16(vp), %r9
- jmp L(lo0)
-
- ALIGN(8)
-L(top): mov 16(up), %r9
- shr %r14
- or %r10, %r13
- shr %r15
- or %r11, %r14
- add R32(%rbx), R32(%rbx)
- mov 24(up), %r10
- mov %rbp, (rp)
- ADCSBB 8(vp), %r8
- mov %r15, %rbp
- lea 32(rp), rp
- mov 32(up), %r11
-L(lo1): ADCSBB 16(vp), %r9
- lea 32(up), up
- mov %r12, -24(rp)
-L(lo0): ADCSBB 24(vp), %r10
- mov %r8, %r12
- mov %r13, -16(rp)
-L(lo3): ADCSBB 32(vp), %r11
- mov %r9, %r13
- mov %r14, -8(rp)
-L(lo2): sbb R32(%rbx), R32(%rbx)
- shl $63, %r8
- mov %r11, %r15
- shr %r12
- mov %r10, %r14
- shl $63, %r9
- lea 32(vp), vp
- shl $63, %r10
- or %r8, %rbp
- shl $63, %r11
- or %r9, %r12
- shr %r13
- mov 8(up), %r8
- sub $4, n
- jg L(top)
-
-L(end): shr %r14
- or %r10, %r13
- shr %r15
- or %r11, %r14
- mov %rbp, (rp)
- lea 32(rp), rp
-L(cj5): add R32(%rbx), R32(%rbx)
- ADCSBB 8(vp), %r8
- mov %r12, -24(rp)
-L(cj4): mov %r13, -16(rp)
-L(cj3): mov %r8, %r12
- mov %r14, -8(rp)
-L(cj2): sbb R32(%rbx), R32(%rbx)
- shl $63, %r8
- shr %r12
- or %r8, %r15
- shl $63, %rbx
- add %rbx, %r12
- mov %r15, (rp)
- mov %r12, 8(rp)
-L(cj1): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/rshift.asm b/gmp/mpn/x86_64/atom/rshift.asm
deleted file mode 100644
index 29c027de49..0000000000
--- a/gmp/mpn/x86_64/atom/rshift.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-dnl AMD64 mpn_rshift -- mpn right shift, optimised for Atom.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel atom 4.5
-C VIA nano ?
-
-C TODO
-C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times
-C larger.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_rshift)
- FUNC_ENTRY(4)
- shr R32(n)
- mov (up), %rax
- jnc L(evn)
-
- mov %rax, %r11
- shr R8(cnt), %r11
- neg R8(cnt)
- shl R8(cnt), %rax
- test n, n
- jnz L(gt1)
- mov %r11, (rp)
- FUNC_EXIT()
- ret
-
-L(gt1): mov 8(up), %r8
- mov %r8, %r10
- shl R8(cnt), %r8
- jmp L(lo1)
-
-L(evn): mov %rax, %r10
- neg R8(cnt)
- shl R8(cnt), %rax
- mov 8(up), %r9
- mov %r9, %r11
- shl R8(cnt), %r9
- neg R8(cnt)
- dec n
- lea -8(rp), rp
- lea 8(up), up
- jz L(end)
-
- ALIGN(8)
-L(top): shr R8(cnt), %r10
- or %r10, %r9
- shr R8(cnt), %r11
- neg R8(cnt)
- mov 8(up), %r8
- mov %r8, %r10
- mov %r9, 8(rp)
- shl R8(cnt), %r8
- lea 16(rp), rp
-L(lo1): mov 16(up), %r9
- or %r11, %r8
- mov %r9, %r11
- shl R8(cnt), %r9
- lea 16(up), up
- neg R8(cnt)
- mov %r8, (rp)
- dec n
- jg L(top)
-
-L(end): shr R8(cnt), %r10
- or %r10, %r9
- shr R8(cnt), %r11
- mov %r9, 8(rp)
- mov %r11, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/atom/sublsh1_n.asm b/gmp/mpn/x86_64/atom/sublsh1_n.asm
deleted file mode 100644
index 1306acde2b..0000000000
--- a/gmp/mpn/x86_64/atom/sublsh1_n.asm
+++ /dev/null
@@ -1,242 +0,0 @@
-dnl AMD64 mpn_sublsh1_n optimised for Intel Atom.
-dnl Used also for AMD bd1.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C TODO
-C * This code is slightly large at 501 bytes.
-C * aorrlsh1_n.asm and this file use the same basic pattern.
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bd1 2.3
-C AMD bobcat ?
-C Intel P4 ?
-C Intel core2 ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel atom 5 (4.875 is probably possible)
-C VIA nano ?
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cy', `%r8')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sublsh1_n)
- FUNC_ENTRY(4)
- push %rbp
- push %r15
- xor R32(%rbp), R32(%rbp)
-L(ent): mov R32(n), R32(%rax)
- and $3, R32(%rax)
- jz L(b0)
- cmp $2, R32(%rax)
- jz L(b2)
- jg L(b3)
-
-L(b1): mov (vp), %r8
- add %r8, %r8
- lea 8(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- mov (up), %r15
- sbb %r8, %r15
- mov %r15, (rp)
- sbb R32(%rbp), R32(%rbp) C save acy
- lea 8(up), up
- lea 8(rp), rp
- jmp L(b0)
-
-L(b2): mov (vp), %r8
- add %r8, %r8
- mov 8(vp), %r9
- adc %r9, %r9
- lea 16(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- mov (up), %r15
- sbb %r8, %r15
- mov %r15, (rp)
- mov 8(up), %r15
- sbb %r9, %r15
- mov %r15, 8(rp)
- sbb R32(%rbp), R32(%rbp) C save acy
- lea 16(up), up
- lea 16(rp), rp
- jmp L(b0)
-
-L(b3): mov (vp), %r8
- add %r8, %r8
- mov 8(vp), %r9
- adc %r9, %r9
- mov 16(vp), %r10
- adc %r10, %r10
- lea 24(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- mov (up), %r15
- sbb %r8, %r15
- mov %r15, (rp)
- mov 8(up), %r15
- sbb %r9, %r15
- mov %r15, 8(rp)
- mov 16(up), %r15
- sbb %r10, %r15
- mov %r15, 16(rp)
- sbb R32(%rbp), R32(%rbp) C save acy
- lea 24(up), up
- lea 24(rp), rp
-
-L(b0): test $4, R8(n)
- jz L(skp)
- add R32(%rax), R32(%rax) C restore scy
- mov (vp), %r8
- adc %r8, %r8
- mov 8(vp), %r9
- adc %r9, %r9
- mov 16(vp), %r10
- adc %r10, %r10
- mov 24(vp), %r11
- adc %r11, %r11
- lea 32(vp), vp
- sbb R32(%rax), R32(%rax) C save scy
- add R32(%rbp), R32(%rbp) C restore acy
- mov (up), %r15
- sbb %r8, %r15
- mov %r15, (rp)
- mov 8(up), %r15
- sbb %r9, %r15
- mov %r15, 8(rp)
- mov 16(up), %r15
- sbb %r10, %r15
- mov %r15, 16(rp)
- mov 24(up), %r15
- sbb %r11, %r15
- mov %r15, 24(rp)
- lea 32(up), up
- lea 32(rp), rp
- sbb R32(%rbp), R32(%rbp) C save acy
-
-L(skp): cmp $8, n
- jl L(rtn)
-
- push %r12
- push %r13
- push %r14
- push %rbx
- lea -64(rp), rp
- jmp L(x)
-
- ALIGN(16)
-L(top): mov (vp), %r8
- add R32(%rax), R32(%rax)
- lea 64(vp), vp
- adc %r8, %r8
- mov -56(vp), %r9
- adc %r9, %r9
- mov -48(vp), %r10
- adc %r10, %r10
- mov -40(vp), %r11
- adc %r11, %r11
- mov -32(vp), %r12
- adc %r12, %r12
- mov -24(vp), %r13
- adc %r13, %r13
- mov -16(vp), %r14
- adc %r14, %r14
- mov -8(vp), %r15
- adc %r15, %r15
- sbb R32(%rax), R32(%rax)
- add R32(%rbp), R32(%rbp)
- mov (up), %rbp
- lea 64(rp), rp
- mov 8(up), %rbx
- sbb %r8, %rbp
- mov 32(up), %r8
- mov %rbp, (rp)
- sbb %r9, %rbx
- mov 16(up), %rbp
- mov %rbx, 8(rp)
- sbb %r10, %rbp
- mov 24(up), %rbx
- mov %rbp, 16(rp)
- sbb %r11, %rbx
- mov %rbx, 24(rp)
- sbb %r12, %r8
- mov 40(up), %r9
- mov %r8, 32(rp)
- sbb %r13, %r9
- mov 48(up), %rbp
- mov %r9, 40(rp)
- sbb %r14, %rbp
- mov 56(up), %rbx
- mov %rbp, 48(rp)
- sbb %r15, %rbx
- lea 64(up), up
- mov %rbx, 56(rp)
- sbb R32(%rbp), R32(%rbp)
-L(x): sub $8, n
- jge L(top)
-
-L(end): pop %rbx
- pop %r14
- pop %r13
- pop %r12
-L(rtn):
- add R32(%rbp), R32(%rax)
- neg R32(%rax)
-
- pop %r15
- pop %rbp
- FUNC_EXIT()
- ret
-EPILOGUE()
-PROLOGUE(mpn_sublsh1_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbp
- push %r15
- neg %r8 C set CF
- sbb R32(%rbp), R32(%rbp) C save acy
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bd1/README b/gmp/mpn/x86_64/bd1/README
deleted file mode 100644
index ccd210e0d6..0000000000
--- a/gmp/mpn/x86_64/bd1/README
+++ /dev/null
@@ -1,11 +0,0 @@
-This directory contains code for AMD bulldozer including its piledriver update.
-
-We currently make limited use of SIMD instructions, both via the MPN_PATH and
-via inclusion of x86_64/fastsse files.
-
-The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably
-means that an all-core GMP load (such as a HPC load) might run slower if there
-is significant SIMD dependency.
-
-We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any
-SIMD code.
diff --git a/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm b/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm
deleted file mode 100644
index c34a5fa134..0000000000
--- a/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
-include_mpn(`x86_64/atom/aorrlsh1_n.asm')
diff --git a/gmp/mpn/x86_64/bd1/aorsmul_1.asm b/gmp/mpn/x86_64/bd1/aorsmul_1.asm
deleted file mode 100644
index 96fec9f5ac..0000000000
--- a/gmp/mpn/x86_64/bd1/aorsmul_1.asm
+++ /dev/null
@@ -1,181 +0,0 @@
-dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bd1 4.5-4.7
-C AMD bobcat
-C Intel P4
-C Intel core2
-C Intel NHM
-C Intel SBR
-C Intel atom
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-C TODO
-C * Try to make loop run closer to 4 c/l.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0', `%rcx') C r9
-
-define(`n', `%r11')
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`func', `mpn_submul_1')
-')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-IFDOS(` define(`up', ``%rsi'') ') dnl
-IFDOS(` define(`rp', ``%rcx'') ') dnl
-IFDOS(` define(`v0', ``%r9'') ') dnl
-IFDOS(` define(`r9', ``rdi'') ') dnl
-IFDOS(` define(`n', ``%r8'') ') dnl
-IFDOS(` define(`r8', ``r11'') ') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
- mov (up), %rax C read first u limb early
- push %rbx
-IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
-IFDOS(` mov n, %rbx ')
- mul v0
-
-IFSTD(` mov %rbx, n ')
-
- and $3, R32(%rbx)
- lea -16(rp,n,8), rp
- jz L(b0)
- cmp $2, R32(%rbx)
- jb L(b1)
- jz L(b2)
-
-L(b3): mov $0, R32(%r8)
- mov %rax, %rbx
- mov $0, R32(%r9)
- mov 8(up), %rax
- mov %rdx, %r10
- lea (up,n,8), up
- not n
- jmp L(L3)
-
-L(b0): mov $0, R32(%r10)
- mov %rax, %r8
- mov %rdx, %rbx
- mov 8(up), %rax
- lea (up,n,8), up
- neg n
- jmp L(L0)
-
-L(b1): cmp $1, n
- jz L(n1)
- mov %rax, %r9
- mov 8(up), %rax
- mov %rdx, %r8
- mov $0, R32(%rbx)
- lea (up,n,8), up
- neg n
- inc n
- jmp L(L1)
-
-L(b2): mov $0, R32(%rbx)
- mov %rax, %r10
- mov %rdx, %r9
- mov 8(up), %rax
- mov $0, R32(%r8)
- lea (up,n,8), up
- neg n
- add $2, n
- jns L(end)
-
- ALIGN(32)
-L(top): mul v0
- ADDSUB %r10, (rp,n,8)
- adc %rax, %r9
- mov (up,n,8), %rax
- adc %rdx, %r8
-L(L1): mul v0
- mov $0, R32(%r10)
- ADDSUB %r9, 8(rp,n,8)
- adc %rax, %r8
- adc %rdx, %rbx
- mov 8(up,n,8), %rax
-L(L0): mul v0
- ADDSUB %r8, 16(rp,n,8)
- mov $0, R32(%r8)
- adc %rax, %rbx
- mov $0, R32(%r9)
- mov 16(up,n,8), %rax
- adc %rdx, %r10
-L(L3): mul v0
- ADDSUB %rbx, 24(rp,n,8)
- mov $0, R32(%rbx)
- adc %rax, %r10
- adc %rdx, %r9
- mov 24(up,n,8), %rax
- add $4, n
- js L(top)
-
-L(end): mul v0
- ADDSUB %r10, (rp)
- adc %r9, %rax
- adc %r8, %rdx
-L(n1): ADDSUB %rax, 8(rp)
- adc $0, %rdx
- mov %rdx, %rax
-
- pop %rbx
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/bd1/com.asm b/gmp/mpn/x86_64/bd1/com.asm
deleted file mode 100644
index 43f356117a..0000000000
--- a/gmp/mpn/x86_64/bd1/com.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_com optimised for AMD bd1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_com)
-include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp/mpn/x86_64/bd1/copyd.asm b/gmp/mpn/x86_64/bd1/copyd.asm
deleted file mode 100644
index 675cdc3f6b..0000000000
--- a/gmp/mpn/x86_64/bd1/copyd.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyd optimised for AMD bd1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyd)
-include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp/mpn/x86_64/bd1/copyi.asm b/gmp/mpn/x86_64/bd1/copyi.asm
deleted file mode 100644
index ceef036585..0000000000
--- a/gmp/mpn/x86_64/bd1/copyi.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyi optimised for AMD bd1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyi)
-include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp/mpn/x86_64/bd1/gcd_1.asm b/gmp/mpn/x86_64/bd1/gcd_1.asm
deleted file mode 100644
index 3d8e5c7ab1..0000000000
--- a/gmp/mpn/x86_64/bd1/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
diff --git a/gmp/mpn/x86_64/bd1/gmp-mparam.h b/gmp/mpn/x86_64/bd1/gmp-mparam.h
deleted file mode 100644
index 5014f9f469..0000000000
--- a/gmp/mpn/x86_64/bd1/gmp-mparam.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
-Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 3600 MHz Bulldozer Zambezi */
-/* FFT tuning limit = 40000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 22
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 59
-#define MUL_TOOM44_THRESHOLD 166
-#define MUL_TOOM6H_THRESHOLD 274
-#define MUL_TOOM8H_THRESHOLD 333
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 115
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 150
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 22
-#define SQR_TOOM3_THRESHOLD 85
-#define SQR_TOOM4_THRESHOLD 242
-#define SQR_TOOM6_THRESHOLD 318
-#define SQR_TOOM8_THRESHOLD 478
-
-#define MULMID_TOOM42_THRESHOLD 22
-
-#define MULMOD_BNM1_THRESHOLD 11
-#define SQRMOD_BNM1_THRESHOLD 14
-
-#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 404, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
- { 19, 7}, { 10, 6}, { 25, 7}, { 15, 6}, \
- { 31, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
- { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 103,12}, { 31,11}, { 63,10}, \
- { 135,11}, { 79,10}, { 167,11}, { 95,10}, \
- { 191,11}, { 111,12}, { 63,11}, { 127,10}, \
- { 255,11}, { 143,10}, { 287, 9}, { 575,10}, \
- { 303,11}, { 159,12}, { 95,11}, { 191,10}, \
- { 383,11}, { 207,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
- { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
- { 319, 9}, { 1279,11}, { 367,12}, { 191,11}, \
- { 383,10}, { 767,11}, { 415,12}, { 223,11}, \
- { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
- { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \
- { 319,10}, { 1279,11}, { 671,12}, { 351,11}, \
- { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
- { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \
- { 895,14}, { 127,13}, { 255,12}, { 511,11}, \
- { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \
- { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \
- { 319,11}, { 1279,12}, { 671,11}, { 1343,10}, \
- { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
- { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \
- { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
- { 2431,10}, { 4863,12}, { 1343,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \
- { 831,12}, { 1727,11}, { 3455,13}, { 895,12}, \
- { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \
- { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \
- { 1215,12}, { 2431,11}, { 4863,13}, { 1343,12}, \
- { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
- { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \
- { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
- { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \
- { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2943,12}, { 5887,15}, { 767,14}, { 1535,13}, \
- { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
- { 1791,13}, { 3583,14}, { 1919,13}, { 3839,16}, \
- { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
- { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 217
-#define MUL_FFT_THRESHOLD 3712
-
-#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 380, 5}, { 17, 6}, { 9, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 135,11}, { 79,10}, { 159,11}, { 95,10}, \
- { 191,11}, { 111,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
- { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
- { 319,12}, { 95,11}, { 191,10}, { 383,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543, 9}, { 1087,11}, { 303,10}, \
- { 607,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671, 9}, { 1343,11}, { 351,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
- { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
- { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
- { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
- { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \
- { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \
- { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \
- { 1407,13}, { 383,12}, { 767,11}, { 1599,10}, \
- { 3199,12}, { 831,13}, { 447,12}, { 895,14}, \
- { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
- { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \
- { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 767,12}, { 1599,11}, \
- { 3199,13}, { 831,12}, { 1727,13}, { 895,15}, \
- { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
- { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \
- { 1343,12}, { 2687,13}, { 1471,12}, { 2943,11}, \
- { 5887,14}, { 767,13}, { 1599,12}, { 3199,13}, \
- { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \
- { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
- { 1407,13}, { 2943,12}, { 5887,15}, { 767,14}, \
- { 1535,13}, { 3199,14}, { 1663,13}, { 3327,12}, \
- { 6655,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
- { 3583,14}, { 1919,13}, { 3839,16}, { 511,15}, \
- { 1023,14}, { 2175,13}, { 4351,14}, { 2303,13}, \
- { 4607,14}, { 2431,13}, { 4863,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 220
-#define SQR_FFT_THRESHOLD 3264
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 39
-#define MULLO_MUL_N_THRESHOLD 7246
-
-#define DC_DIV_QR_THRESHOLD 54
-#define DC_DIVAPPR_Q_THRESHOLD 180
-#define DC_BDIV_QR_THRESHOLD 47
-#define DC_BDIV_Q_THRESHOLD 80
-
-#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 226
-#define INV_APPR_THRESHOLD 188
-
-#define BINV_NEWTON_THRESHOLD 248
-#define REDC_1_TO_REDC_2_THRESHOLD 52
-#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
-
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1360
-#define MUPI_DIV_QR_THRESHOLD 108
-#define MU_BDIV_QR_THRESHOLD 1142
-#define MU_BDIV_Q_THRESHOLD 1360
-
-#define POWM_SEC_TABLE 1,16,194,386,452,2245
-
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 108
-#define HGCD_APPR_THRESHOLD 51
-#define HGCD_REDUCE_THRESHOLD 2681
-#define GCD_DC_THRESHOLD 474
-#define GCDEXT_DC_THRESHOLD 298
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 418
-#define SET_STR_PRECOMPUTE_THRESHOLD 1289
-
-#define FAC_DSC_THRESHOLD 252
-#define FAC_ODD_THRESHOLD 23
diff --git a/gmp/mpn/x86_64/bd1/hamdist.asm b/gmp/mpn/x86_64/bd1/hamdist.asm
deleted file mode 100644
index 93e1e5632b..0000000000
--- a/gmp/mpn/x86_64/bd1/hamdist.asm
+++ /dev/null
@@ -1,38 +0,0 @@
-dnl AMD64 mpn_hamdist -- hamming distance.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_hamdist)
-include_mpn(`x86_64/k10/hamdist.asm')
diff --git a/gmp/mpn/x86_64/bd1/mul_1.asm b/gmp/mpn/x86_64/bd1/mul_1.asm
deleted file mode 100644
index e59667c085..0000000000
--- a/gmp/mpn/x86_64/bd1/mul_1.asm
+++ /dev/null
@@ -1,184 +0,0 @@
-dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bd1 4
-C AMD bobcat
-C Intel P4
-C Intel core2
-C Intel NHM
-C Intel SBR
-C Intel atom
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-C TODO
-C * Move loop code into feed-in blocks, to save insn for zeroing regs.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0', `%rcx') C r9
-
-define(`n', `%rbx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(` define(`up', ``%rsi'') ') dnl
-IFDOS(` define(`rp', ``%rcx'') ') dnl
-IFDOS(` define(`v0', ``%r9'') ') dnl
-IFDOS(` define(`r9', ``rdi'') ') dnl
-IFDOS(` define(`n', ``%r8'') ') dnl
-IFDOS(` define(`r8', ``rbx'') ') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_1c)
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
- mov (up), %rax C read first u limb early
- push %rbx
-IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
-IFDOS(` mov n, %r11 ')
- mul v0
-
-IFSTD(` add %r8, %rax ')
-IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns)
- adc $0, %rdx
- jmp L(common)
-
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mul_1)
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
- mov (up), %rax C read first u limb early
- push %rbx
-IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
-IFDOS(` mov n, %r11 ')
- mul v0
-
-L(common):
-IFSTD(` mov %r11, n ')
-
- and $3, R32(%r11)
- lea -16(rp,n,8), rp
- jz L(b0)
- cmp $2, R32(%r11)
- jb L(b1)
- jz L(b2)
-
-L(b3): mov %rax, %r10
- mov %rdx, %r11
- mov 8(up), %rax
- mul v0
- lea (up,n,8), up
- not n
- jmp L(L3)
-
-L(b0): mov %rax, %r9
- mov %rdx, %r10
- mov 8(up), %rax
- lea (up,n,8), up
- neg n
- jmp L(L0)
-
-L(b1): mov %rax, %r8
- cmp $1, n
- jz L(n1)
- mov %rdx, %r9
- lea (up,n,8), up
- neg n
- mov %r8, 16(rp,n,8)
- inc n
- jmp L(L1)
-
-L(b2): mov %rax, %r11
- mov %rdx, %r8
- mov 8(up), %rax
- lea (up,n,8), up
- neg n
- add $2, n
- jns L(end)
-
- ALIGN(16)
-L(top): mul v0
- mov %rdx, %r9
- add %rax, %r8
- adc $0, %r9
- mov %r8, 8(rp,n,8)
- mov %r11, (rp,n,8)
-L(L1): mov (up,n,8), %rax
- mul v0
- add %rax, %r9
- mov %rdx, %r10
- mov 8(up,n,8), %rax
- adc $0, %r10
-L(L0): mul v0
- add %rax, %r10
- mov %rdx, %r11
- mov 16(up,n,8), %rax
- adc $0, %r11
- mul v0
- mov %r9, 16(rp,n,8)
-L(L3): add %rax, %r11
- mov %r10, 24(rp,n,8)
- mov %rdx, %r8
- adc $0, %r8
- add $4, n
- mov -8(up,n,8), %rax
- js L(top)
-
-L(end): mul v0
- add %rax, %r8
- adc $0, %rdx
- mov %r11, (rp)
-L(n1): mov %r8, 8(rp)
- mov %rdx, %rax
-
- pop %rbx
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/bd1/mul_2.asm b/gmp/mpn/x86_64/bd1/mul_2.asm
deleted file mode 100644
index 4ed5f30561..0000000000
--- a/gmp/mpn/x86_64/bd1/mul_2.asm
+++ /dev/null
@@ -1,192 +0,0 @@
-dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bull 4.36 average, quite fluctuating
-C AMD pile 4.38 slighty fluctuating
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-C Scheme: genxmul --mul
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`vp', `%rcx') C r9
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_2)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (up), %rax
-
- mov (vp), v0
- mov 8(vp), v1
-
- lea (up,n_param,8), up
- lea (rp,n_param,8), rp
-
- mov n_param, n
- mul v0
- neg n
-
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- jnz L(b10)
-
-L(b00): mov %rax, w0
- mov %rdx, w1
- xor R32(w2), R32(w2)
- mov (up,n,8), %rax
- jmp L(lo0)
-
-L(b10): mov %rax, w2
- mov %rdx, w3
- mov (up,n,8), %rax
- xor R32(w0), R32(w0)
- mul v1
- add $-2, n
- jmp L(lo2)
-
-L(bx1): test $2, R8(n)
- jz L(b11)
-
-L(b01): mov %rax, w3
- mov %rdx, w0
- mov (up,n,8), %rax
- mul v1
- xor R32(w1), R32(w1)
- inc n
- jmp L(lo1)
-
-L(b11): mov %rax, w1
- mov %rdx, w2
- mov (up,n,8), %rax
- xor R32(w3), R32(w3)
- dec n
- jmp L(lo3)
-
- ALIGN(32)
-L(top): mov -8(up,n,8), %rax
- mul v1
- mov w2, -16(rp,n,8)
-L(lo1): add %rax, w0
- mov w3, -8(rp,n,8)
- adc %rdx, w1
- mov (up,n,8), %rax
- mul v0
- mov $0, R32(w2)
- add %rax, w0
- adc %rdx, w1
- adc $0, R32(w2)
- mov (up,n,8), %rax
-L(lo0): mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,n,8), %rax
- mul v0
- add %rax, w1
- mov w0, (rp,n,8)
- mov $0, R32(w3)
- mov 8(up,n,8), %rax
- adc %rdx, w2
- adc $0, R32(w3)
-L(lo3): mul v1
- add %rax, w2
- mov 16(up,n,8), %rax
- adc %rdx, w3
- mul v0
- add %rax, w2
- mov 16(up,n,8), %rax
- mov $0, R32(w0)
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov w1, 8(rp,n,8)
-L(lo2): add %rax, w3
- adc %rdx, w0
- mov 24(up,n,8), %rax
- mul v0
- add %rax, w3
- adc %rdx, w0
- mov $0, R32(w1)
- adc $0, R32(w1)
- add $4, n
- jnc L(top)
-
-L(end): mov -8(up,n,8), %rax
- mul v1
- mov w2, -16(rp,n,8)
- add %rax, w0
- mov w3, -8(rp,n,8)
- adc %rdx, w1
- mov w0, (rp,n,8)
- mov w1, %rax
-
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bd1/mul_basecase.asm b/gmp/mpn/x86_64/bd1/mul_basecase.asm
deleted file mode 100644
index e47ba587cd..0000000000
--- a/gmp/mpn/x86_64/bd1/mul_basecase.asm
+++ /dev/null
@@ -1,416 +0,0 @@
-dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_1 mul_2 mul_3 addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull ~4.8 ~4.55 - ~4.3
-C AMD pile ~4.6 ~4.55 - ~4.55
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-C TODO
-C * Merge bull-specific mul_1, if it is not slower the TOOM22 range.
-C Alternatively, we could tweak the present code (which was loopmixed for a
-C different CPU).
-C * Merge faster mul_2, such as the one in the same directory as this file.
-C * Further micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-define(`vp', `%rcx')
-define(`vn', `%r8')
-
-define(`un', `%rbx')
-
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`n', `%rbp')
-define(`v0', `%r9')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
- push %rbx
- push %rbp
- mov un_param, un C free up rdx
- neg un
-
- mov (up), %rax C shared for mul_1 and mul_2
- lea (up,un_param,8), up C point at operand end
- lea (rp,un_param,8), rp C point at rp[un-1]
-
- mov (vp), v0 C shared for mul_1 and mul_2
- mul v0 C shared for mul_1 and mul_2
-
- test $1, R8(vn)
- jz L(do_mul_2)
-
-L(do_mul_1):
- test $1, R8(un)
- jnz L(m1x1)
-
-L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
- mov %rdx, w1
- mov 8(up,un,8), %rax
- test $2, R8(un)
- jnz L(m110)
-
-L(m100):lea 2(un), n C un = 4, 8, 12, ...
- jmp L(m1l0)
-
-L(m110):lea (un), n C un = 2, 6, 10, ...
- jmp L(m1l2)
-
-L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
- mov %rdx, w0
- test $2, R8(un)
- jz L(m111)
-
-L(m101):lea 3(un), n C un = 1, 5, 9, ...
- test n, n
- js L(m1l1)
- mov %rax, -8(rp)
- mov %rdx, (rp)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(m111):lea 1(un), n C un = 3, 7, 11, ...
- mov 8(up,un,8), %rax
- jmp L(m1l3)
-
- ALIGN(16)
-L(m1tp):mov %rdx, w0
- add %rax, w1
-L(m1l1):mov -16(up,n,8), %rax
- adc $0, w0
- mul v0
- add %rax, w0
- mov w1, -24(rp,n,8)
- mov -8(up,n,8), %rax
- mov %rdx, w1
- adc $0, w1
-L(m1l0):mul v0
- mov w0, -16(rp,n,8)
- add %rax, w1
- mov %rdx, w0
- mov (up,n,8), %rax
- adc $0, w0
-L(m1l3):mul v0
- mov w1, -8(rp,n,8)
- mov %rdx, w1
- add %rax, w0
- mov 8(up,n,8), %rax
- adc $0, w1
-L(m1l2):mul v0
- mov w0, (rp,n,8)
- add $4, n
- jnc L(m1tp)
-
-L(m1ed):add %rax, w1
- adc $0, %rdx
- mov w1, I(-8(rp),-24(rp,n,8))
- mov %rdx, I((rp),-16(rp,n,8))
-
- dec R32(vn)
- jz L(ret2)
-
- lea 8(vp), vp
- lea 8(rp), rp
- push %r12
- push %r13
- push %r14
- jmp L(do_addmul)
-
-L(do_mul_2):
-define(`v1', `%r14')
- push %r12
- push %r13
- push %r14
-
- mov 8(vp), v1
-
- test $1, R8(un)
- jnz L(m2b1)
-
-L(m2b0):lea (un), n
- mov %rax, w2 C 0
- mov (up,un,8), %rax
- mov %rdx, w1 C 1
- mul v1
- mov %rax, w0 C 1
- mov w2, (rp,un,8) C 0
- mov 8(up,un,8), %rax
- mov %rdx, w2 C 2
- jmp L(m2l0)
-
-L(m2b1):lea 1(un), n
- mov %rax, w0 C 1
- mov %rdx, w3 C 2
- mov (up,un,8), %rax
- mul v1
- mov w0, (rp,un,8) C 1
- mov %rdx, w0 C 3
- mov %rax, w2 C 0
- mov 8(up,un,8), %rax
- jmp L(m2l1)
-
- ALIGN(32)
-L(m2tp):add %rax, w2 C 0
- mov (up,n,8), %rax
- adc $0, w0 C 1
-L(m2l1):mul v0
- add %rax, w2 C 0
- mov (up,n,8), %rax
- mov %rdx, w1 C 1
- adc $0, w1 C 1
- mul v1
- add w3, w2 C 0
- adc $0, w1 C 1
- add %rax, w0 C 1
- mov w2, (rp,n,8) C 0
- mov 8(up,n,8), %rax
- mov %rdx, w2 C 2
- adc $0, w2 C 2
-L(m2l0):mul v0
- add %rax, w0 C 1
- mov %rdx, w3 C 2
- adc $0, w3 C 2
- add w1, w0 C 1
- adc $0, w3 C 2
- mov 8(up,n,8), %rax
- mul v1
- add $2, n
- mov w0, -8(rp,n,8) C 1
- mov %rdx, w0 C 3
- jnc L(m2tp)
-
-L(m2ed):add %rax, w2
- adc $0, %rdx
- add w3, w2
- adc $0, %rdx
- mov w2, I((rp),(rp,n,8))
- mov %rdx, I(8(rp),8(rp,n,8))
-
- add $-2, R32(vn)
- jz L(ret5)
-
- lea 16(vp), vp
- lea 16(rp), rp
-
-
-L(do_addmul):
- push %r15
- push vn C save vn in new stack slot
-define(`vn', `(%rsp)')
-define(`X0', `%r14')
-define(`X1', `%r15')
-define(`v1', `%r8')
-
-L(outer):
- mov (vp), v0
- mov 8(vp), v1
-
- mov (up,un,8), %rax
- mul v0
-
- test $1, R8(un)
- jnz L(bx1)
-
-L(bx0): mov %rax, X1
- mov (up,un,8), %rax
- mov %rdx, X0
- mul v1
- test $2, R8(un)
- jnz L(b10)
-
-L(b00): lea (un), n C un = 4, 8, 12, ...
- mov (rp,un,8), w3
- mov %rax, w0
- mov 8(up,un,8), %rax
- mov %rdx, w1
- jmp L(lo0)
-
-L(b10): lea 2(un), n C un = 2, 6, 10, ...
- mov (rp,un,8), w1
- mov %rdx, w3
- mov %rax, w2
- mov 8(up,un,8), %rax
- jmp L(lo2)
-
-L(bx1): mov %rax, X0
- mov (up,un,8), %rax
- mov %rdx, X1
- mul v1
- test $2, R8(un)
- jz L(b11)
-
-L(b01): lea 1(un), n C un = 1, 5, 9, ...
- mov (rp,un,8), w2
- mov %rdx, w0
- mov %rax, w3
- jmp L(lo1)
-
-L(b11): lea -1(un), n C un = 3, 7, 11, ...
- mov (rp,un,8), w0
- mov %rax, w1
- mov 8(up,un,8), %rax
- mov %rdx, w2
- jmp L(lo3)
-
- ALIGN(32)
-L(top):
-L(lo2): mul v0
- add w1, X1
- mov X1, -16(rp,n,8)
- mov %rdx, X1
- adc %rax, X0
- adc $0, X1
- mov -8(up,n,8), %rax
- mul v1
- mov -8(rp,n,8), w1
- mov %rdx, w0
- add w1, w2
- adc %rax, w3
- adc $0, w0
-L(lo1): mov (up,n,8), %rax
- mul v0
- add w2, X0
- mov X0, -8(rp,n,8)
- mov %rdx, X0
- adc %rax, X1
- mov (up,n,8), %rax
- adc $0, X0
- mov (rp,n,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- mov 8(up,n,8), %rax
- mov %rdx, w1
- adc $0, w1
-L(lo0): mul v0
- add w3, X1
- mov X1, (rp,n,8)
- adc %rax, X0
- mov 8(up,n,8), %rax
- mov %rdx, X1
- adc $0, X1
- mov 8(rp,n,8), w3
- mul v1
- add w3, w0
- adc %rax, w1
- mov 16(up,n,8), %rax
- mov %rdx, w2
- adc $0, w2
-L(lo3): mul v0
- add w0, X0
- mov X0, 8(rp,n,8)
- mov %rdx, X0
- adc %rax, X1
- adc $0, X0
- mov 16(up,n,8), %rax
- mov 16(rp,n,8), w0
- mul v1
- mov %rdx, w3
- add w0, w1
- adc %rax, w2
- adc $0, w3
- mov 24(up,n,8), %rax
- add $4, n
- jnc L(top)
-
-L(end): mul v0
- add w1, X1
- mov X1, I(-16(rp),-16(rp,n,8))
- mov %rdx, X1
- adc %rax, X0
- adc $0, X1
- mov I(-8(up),-8(up,n,8)), %rax
- mul v1
- mov I(-8(rp),-8(rp,n,8)), w1
- add w1, w2
- adc %rax, w3
- adc $0, %rdx
- add w2, X0
- adc $0, X1
- mov X0, I(-8(rp),-8(rp,n,8))
- add w3, X1
- mov X1, I((rp),(rp,n,8))
- adc $0, %rdx
- mov %rdx, I(8(rp),8(rp,n,8))
-
-
- addl $-2, vn
- lea 16(vp), vp
- lea 16(rp), rp
- jnz L(outer)
-
- pop %rax C deallocate vn slot
- pop %r15
-L(ret5):pop %r14
- pop %r13
- pop %r12
-L(ret2):pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bd1/popcount.asm b/gmp/mpn/x86_64/bd1/popcount.asm
deleted file mode 100644
index 8f22a715b6..0000000000
--- a/gmp/mpn/x86_64/bd1/popcount.asm
+++ /dev/null
@@ -1,38 +0,0 @@
-dnl AMD64 mpn_popcount -- population count.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86_64/k10/popcount.asm')
diff --git a/gmp/mpn/x86_64/bd1/sec_tabselect.asm b/gmp/mpn/x86_64/bd1/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/bd1/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_sec_tabselect.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/bd1/sublsh1_n.asm b/gmp/mpn/x86_64/bd1/sublsh1_n.asm
deleted file mode 100644
index 4ba673d15a..0000000000
--- a/gmp/mpn/x86_64/bd1/sublsh1_n.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_sublsh1_n
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
-include_mpn(`x86_64/atom/sublsh1_n.asm')
diff --git a/gmp/mpn/x86_64/bd2/gmp-mparam.h b/gmp/mpn/x86_64/bd2/gmp-mparam.h
deleted file mode 100644
index 16f25c4c7b..0000000000
--- a/gmp/mpn/x86_64/bd2/gmp-mparam.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
-Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 3200 MHz Piledriver Vishera */
-/* FFT tuning limit = 40000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.8 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 2
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 21
-
-#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 54
-#define MUL_TOOM44_THRESHOLD 154
-#define MUL_TOOM6H_THRESHOLD 274
-#define MUL_TOOM8H_THRESHOLD 454
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 147
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 22
-#define SQR_TOOM3_THRESHOLD 81
-#define SQR_TOOM4_THRESHOLD 218
-#define SQR_TOOM6_THRESHOLD 303
-#define SQR_TOOM8_THRESHOLD 430
-
-#define MULMID_TOOM42_THRESHOLD 20
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 13
-
-#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 99,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 135, 6}, { 2175, 7}, { 1151, 9}, { 303,10}, \
- { 159, 9}, { 319, 8}, { 639, 9}, { 335,11}, \
- { 95,10}, { 191,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255,11}, { 143,10}, { 303,11}, \
- { 159,12}, { 95,11}, { 191,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
- { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
- { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
- { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 511,12}, { 287,11}, { 575,10}, { 1151,12}, \
- { 319,11}, { 639,10}, { 1279,12}, { 351,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,10}, \
- { 1663,12}, { 447,14}, { 127,13}, { 255,12}, \
- { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
- { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \
- { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
- { 1343,10}, { 2687,12}, { 703,11}, { 1407,10}, \
- { 2815,12}, { 735,13}, { 383,12}, { 767,11}, \
- { 1599,12}, { 831,11}, { 1663,13}, { 447,12}, \
- { 895,11}, { 1791,14}, { 255,13}, { 511,12}, \
- { 1023,11}, { 2047,12}, { 1087,11}, { 2175,13}, \
- { 575,12}, { 1151,11}, { 2303,12}, { 1215,11}, \
- { 2431,10}, { 4863,13}, { 639,12}, { 1279,11}, \
- { 2559,12}, { 1343,11}, { 2687,13}, { 703,12}, \
- { 1407,11}, { 2815,14}, { 383,13}, { 767,12}, \
- { 1599,13}, { 831,12}, { 1663,13}, { 895,12}, \
- { 1791,15}, { 255,14}, { 511,13}, { 1023,12}, \
- { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
- { 2303,13}, { 1215,12}, { 2431,11}, { 4863,14}, \
- { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
- { 2815,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
- { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \
- { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
- { 3839,11}, { 7679,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
- { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \
- { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \
- { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
- { 3583,14}, { 1919,13}, { 3839,12}, { 7679,16}, \
- { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
- { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 225
-#define MUL_FFT_THRESHOLD 3712
-
-#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 344, 5}, { 11, 4}, { 23, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 17, 6}, \
- { 35, 8}, { 9, 7}, { 21, 8}, { 11, 7}, \
- { 25, 8}, { 13, 7}, { 27, 8}, { 15, 7}, \
- { 31, 8}, { 17, 7}, { 35, 8}, { 21, 9}, \
- { 11, 8}, { 27, 9}, { 15, 8}, { 35, 9}, \
- { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \
- { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \
- { 51,11}, { 15,10}, { 31, 9}, { 63,10}, \
- { 39, 9}, { 79,10}, { 47, 9}, { 95,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 135,11}, { 95,10}, { 191, 6}, { 3199, 7}, \
- { 1727, 9}, { 447,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \
- { 575,10}, { 303, 9}, { 607,10}, { 319,12}, \
- { 95,11}, { 191,10}, { 383,11}, { 207,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
- { 351,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,12}, { 223,11}, { 447,10}, { 895,13}, \
- { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
- { 543,12}, { 287,11}, { 575,12}, { 319,10}, \
- { 1279,12}, { 351,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
- { 895,14}, { 127,13}, { 255,12}, { 511,11}, \
- { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \
- { 575,13}, { 319,11}, { 1279,12}, { 671,11}, \
- { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
- { 831,11}, { 1663,13}, { 447,12}, { 895,14}, \
- { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
- { 575,11}, { 2303,12}, { 1215,11}, { 2431,10}, \
- { 4863,13}, { 639,12}, { 1343,11}, { 2687,13}, \
- { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
- { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \
- { 895,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2239,11}, { 4479,12}, { 2303,13}, { 1215,12}, \
- { 2431,11}, { 4863,14}, { 639,13}, { 1279,12}, \
- { 2559,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
- { 2815,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
- { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \
- { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
- { 3839,15}, { 511,14}, { 1023,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2943,12}, { 5887,11}, { 11775,15}, { 767,14}, \
- { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
- { 6911,14}, { 1791,13}, { 3583,14}, { 1919,13}, \
- { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \
- { 4479,14}, { 2303,13}, { 4607,14}, { 2431,13}, \
- { 4863,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 214
-#define SQR_FFT_THRESHOLD 3264
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 37
-#define MULLO_MUL_N_THRESHOLD 8397
-
-#define DC_DIV_QR_THRESHOLD 42
-#define DC_DIVAPPR_Q_THRESHOLD 173
-#define DC_BDIV_QR_THRESHOLD 42
-#define DC_BDIV_Q_THRESHOLD 77
-
-#define INV_MULMOD_BNM1_THRESHOLD 30
-#define INV_NEWTON_THRESHOLD 202
-#define INV_APPR_THRESHOLD 172
-
-#define BINV_NEWTON_THRESHOLD 238
-#define REDC_1_TO_REDC_2_THRESHOLD 44
-#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
-
-#define MU_DIV_QR_THRESHOLD 1308
-#define MU_DIVAPPR_Q_THRESHOLD 1334
-#define MUPI_DIV_QR_THRESHOLD 85
-#define MU_BDIV_QR_THRESHOLD 1142
-#define MU_BDIV_Q_THRESHOLD 1308
-
-#define POWM_SEC_TABLE 1,16,257,452,1099,2079
-
-#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 110
-#define HGCD_APPR_THRESHOLD 96
-#define HGCD_REDUCE_THRESHOLD 2479
-#define GCD_DC_THRESHOLD 372
-#define GCDEXT_DC_THRESHOLD 293
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 204
-#define SET_STR_PRECOMPUTE_THRESHOLD 1160
-
-#define FAC_DSC_THRESHOLD 166
-#define FAC_ODD_THRESHOLD 24
diff --git a/gmp/mpn/x86_64/bdiv_dbm1c.asm b/gmp/mpn/x86_64/bdiv_dbm1c.asm
index a53bd52beb..8d73b9fe00 100644
--- a/gmp/mpn/x86_64/bdiv_dbm1c.asm
+++ b/gmp/mpn/x86_64/bdiv_dbm1c.asm
@@ -1,106 +1,98 @@
dnl x86_64 mpn_bdiv_dbm1.
-dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.25
-C AMD K10 2.25
-C Intel P4 12.5
-C Intel core2 4
-C Intel NHM 3.75
-C Intel SBR 3.6
-C Intel atom 20
-C VIA nano 4
+C K8,K9: 2.25
+C K10: ?
+C P4: 12.5
+C P6-15 (Core2): 4.0
+C P6-28 (Atom): 20
C TODO
-C * Optimise feed-in code.
-
-C INPUT PARAMETERS
-define(`qp', `%rdi')
-define(`up', `%rsi')
-define(`n_param', `%rdx')
-define(`bd', `%rcx')
-define(`cy', `%r8')
+C * Do proper 4-way feed-in instead of the current epilogue
-define(`n', `%r9')
+C INPUT PARAMETERS shared
+define(`qp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`bd', `%rcx')
+define(`cy', `%r8')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_bdiv_dbm1c)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- mov (up), %rax
- mov n_param, n
- mov R32(n_param), R32(%r11)
- mul bd
- lea (up,n,8), up
- lea (qp,n,8), qp
- neg n
- and $3, R32(%r11)
- jz L(lo0)
- lea -4(n,%r11), n
- cmp $2, R32(%r11)
- jc L(lo1)
- jz L(lo2)
- jmp L(lo3)
+ mov (%rsi), %rax
+ mov %rdx, %r9 C n
+ mul %rcx
+ sub %rax, %r8
+ mov %r8, (%rdi)
+ sbb %rdx, %r8
+
+ lea (%rsi,%r9,8), %rsi
+ lea (%rdi,%r9,8), %rdi
+ neg %r9
+ add $4, %r9
+ jns L(end)
ALIGN(16)
-L(top): mov (up,n,8), %rax
- mul bd
-L(lo0): sub %rax, %r8
- mov %r8, (qp,n,8)
+L(top):
+ mov -24(%rsi,%r9,8), %rax
+ mul %rcx
+ sub %rax, %r8
+ mov %r8, -24(%rdi,%r9,8)
sbb %rdx, %r8
- mov 8(up,n,8), %rax
- mul bd
-L(lo3): sub %rax, %r8
- mov %r8, 8(qp,n,8)
+L(3):
+ mov -16(%rsi,%r9,8), %rax
+ mul %rcx
+ sub %rax, %r8
+ mov %r8, -16(%rdi,%r9,8)
sbb %rdx, %r8
- mov 16(up,n,8), %rax
- mul bd
-L(lo2): sub %rax, %r8
- mov %r8, 16(qp,n,8)
+L(2):
+ mov -8(%rsi,%r9,8), %rax
+ mul %rcx
+ sub %rax, %r8
+ mov %r8, -8(%rdi,%r9,8)
sbb %rdx, %r8
- mov 24(up,n,8), %rax
- mul bd
-L(lo1): sub %rax, %r8
- mov %r8, 24(qp,n,8)
+L(1):
+ mov (%rsi,%r9,8), %rax
+ mul %rcx
+ sub %rax, %r8
+ mov %r8, (%rdi,%r9,8)
sbb %rdx, %r8
- add $4, n
- jnz L(top)
- mov %r8, %rax
- FUNC_EXIT()
+ add $4, %r9
+ js L(top)
+L(end):
+ je L(3x)
+ cmp $2, %r9
+ jg L(ret)
+ mov $-1, %r9
+ je L(1)
+ jmp L(2)
+L(3x):
+ dec %r9
+ jmp L(3)
+
+L(ret): mov %r8, %rax
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/bdiv_q_1.asm b/gmp/mpn/x86_64/bdiv_q_1.asm
deleted file mode 100644
index 02eacbe6a8..0000000000
--- a/gmp/mpn/x86_64/bdiv_q_1.asm
+++ /dev/null
@@ -1,167 +0,0 @@
-dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
-dnl 1-limb divisor, returning quotient only.
-
-dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012 Free Software
-dnl Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 10
-C AMD K10 10
-C Intel P4 33
-C Intel core2 13.25
-C Intel corei 14
-C Intel atom 42
-C VIA nano ?
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-define(`d', `%rcx')
-define(`di', `%r8') C just mpn_pi1_bdiv_q_1
-define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_bdiv_q_1)
- FUNC_ENTRY(4)
- push %rbx
-
- mov %rcx, %rax
- xor R32(%rcx), R32(%rcx) C ncnt count
- mov %rdx, %r10
-
- bt $0, R32(%rax)
- jnc L(evn) C skip bsfq unless divisor is even
-
-L(odd): mov %rax, %rbx
- shr R32(%rax)
- and $127, R32(%rax) C d/2, 7 bits
-
- LEA( binvert_limb_table, %rdx)
-
- movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
-
- mov %rbx, %r11 C d without twos
-
- lea (%rax,%rax), R32(%rdx) C 2*inv
- imul R32(%rax), R32(%rax) C inv*inv
- imul R32(%rbx), R32(%rax) C inv*inv*d
- sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
-
- lea (%rdx,%rdx), R32(%rax) C 2*inv
- imul R32(%rdx), R32(%rdx) C inv*inv
- imul R32(%rbx), R32(%rdx) C inv*inv*d
- sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
-
- lea (%rax,%rax), %r8 C 2*inv
- imul %rax, %rax C inv*inv
- imul %rbx, %rax C inv*inv*d
- sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits
-
- jmp L(com)
-
-L(evn): bsf %rax, %rcx
- shr R8(%rcx), %rax
- jmp L(odd)
-EPILOGUE()
-
-PROLOGUE(mpn_pi1_bdiv_q_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
- push %rbx
-
- mov %rcx, %r11 C d
- mov %rdx, %r10 C n
- mov %r9, %rcx C ncnt
-
-L(com): mov (up), %rax C up[0]
-
- dec %r10
- jz L(one)
-
- mov 8(up), %rdx C up[1]
- lea (up,%r10,8), up C up end
- lea (rp,%r10,8), rp C rp end
- neg %r10 C -n
-
- shrd R8(%rcx), %rdx, %rax
-
- xor R32(%rbx), R32(%rbx)
- jmp L(ent)
-
- ALIGN(8)
-L(top):
- C rax q
- C rbx carry bit, 0 or 1
- C rcx ncnt
- C rdx
- C r10 counter, limbs, negative
-
- mul %r11 C carry limb in rdx
- mov (up,%r10,8), %rax
- mov 8(up,%r10,8), %r9
- shrd R8(%rcx), %r9, %rax
- nop
- sub %rbx, %rax C apply carry bit
- setc R8(%rbx)
- sub %rdx, %rax C apply carry limb
- adc $0, %rbx
-L(ent): imul %r8, %rax
- mov %rax, (rp,%r10,8)
- inc %r10
- jnz L(top)
-
- mul %r11 C carry limb in rdx
- mov (up), %rax C up high limb
- shr R8(%rcx), %rax
- sub %rbx, %rax C apply carry bit
- sub %rdx, %rax C apply carry limb
- imul %r8, %rax
- mov %rax, (rp)
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(one): shr R8(%rcx), %rax
- imul %r8, %rax
- mov %rax, (rp)
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bobcat/aors_n.asm b/gmp/mpn/x86_64/bobcat/aors_n.asm
deleted file mode 100644
index 22287b8558..0000000000
--- a/gmp/mpn/x86_64/bobcat/aors_n.asm
+++ /dev/null
@@ -1,150 +0,0 @@
-dnl AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.
-
-dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bd1
-C AMD bobcat 2.28
-C Intel P4
-C Intel core2
-C Intel NHM
-C Intel SBR
-C Intel IBR
-C Intel atom
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimization tool suite written by David Harvey and Torbjorn Granlund.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`vp', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
-
-ifdef(`OPERATION_add_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
- xor %r8, %r8
-L(ent): test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- jnz L(b10)
-
-L(b00): shr $2, n
- neg %r8
- mov $3, R32(%rax)
- mov (up), %r10
- mov 8(up), %r11
- jmp L(lo0)
-
-L(b10): shr $2, n
- neg %r8
- mov $1, R32(%rax)
- mov (up), %r8
- mov 8(up), %r9
- jrcxz L(cj2)
- jmp L(top)
-
-L(bx1): test $2, R8(n)
- jnz L(b11)
-
-L(b01): shr $2, n
- neg %r8
- mov $0, R32(%rax)
- mov (up), %r9
- jrcxz L(cj1)
- mov 8(up), %r10
- jmp L(lo1)
-
- ALIGN(8)
-L(b11): inc n
- shr $2, n
- neg %r8
- mov $2, R32(%rax)
- mov (up), %r11
- jmp L(lo3)
-
- ALIGN(4)
-L(top): mov 8(up,%rax,8), %r10
- ADCSBB -8(vp,%rax,8), %r8
- mov %r8, -8(rp,%rax,8)
-L(lo1): mov 16(up,%rax,8), %r11
- ADCSBB (vp,%rax,8), %r9
- lea 4(%rax), %rax
- mov %r9, -32(rp,%rax,8)
-L(lo0): ADCSBB -24(vp,%rax,8), %r10
- mov %r10, -24(rp,%rax,8)
-L(lo3): ADCSBB -16(vp,%rax,8), %r11
- dec n
- mov -8(up,%rax,8), %r8
- mov %r11, -16(rp,%rax,8)
-L(lo2): mov (up,%rax,8), %r9
- jnz L(top)
-
-L(cj2): ADCSBB -8(vp,%rax,8), %r8
- mov %r8, -8(rp,%rax,8)
-L(cj1): ADCSBB (vp,%rax,8), %r9
- mov %r9, (rp,%rax,8)
-
- mov $0, R32(%rax)
- adc $0, R32(%rax)
-
- FUNC_EXIT()
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bobcat/aorsmul_1.asm b/gmp/mpn/x86_64/bobcat/aorsmul_1.asm
deleted file mode 100644
index 415a17cb7f..0000000000
--- a/gmp/mpn/x86_64/bobcat/aorsmul_1.asm
+++ /dev/null
@@ -1,183 +0,0 @@
-dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bobcat.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 4.5
-C AMD K10 4.5
-C AMD bd1 4.75
-C AMD bobcat 5
-C Intel P4 17.7
-C Intel core2 5.5
-C Intel NHM 5.43
-C Intel SBR 3.92
-C Intel atom 23
-C VIA nano 5.63
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`func', `mpn_submul_1')
-')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-C Standard parameters
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param', `%rdx')
-define(`v0', `%rcx')
-C Standard allocations
-define(`n', `%rbx')
-define(`w0', `%r8')
-define(`w1', `%r9')
-define(`w2', `%r10')
-define(`w3', `%r11')
-
-C DOS64 parameters
-IFDOS(` define(`rp', `%rcx') ') dnl
-IFDOS(` define(`up', `%rsi') ') dnl
-IFDOS(` define(`n_param', `%r8') ') dnl
-IFDOS(` define(`v0', `%r9') ') dnl
-C DOS64 allocations
-IFDOS(` define(`n', `%rbx') ') dnl
-IFDOS(` define(`w0', `%r8') ') dnl
-IFDOS(` define(`w1', `%rdi') ') dnl
-IFDOS(` define(`w2', `%r10') ') dnl
-IFDOS(` define(`w3', `%r11') ') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
-IFDOS(` push %rsi ')
-IFDOS(` push %rdi ')
-IFDOS(` mov %rdx, %rsi ')
-
- push %rbx
- mov (up), %rax
-
- lea -16(rp,n_param,8), rp
- lea -16(up,n_param,8), up
-
- mov n_param, n
- and $3, R32(n_param)
- jz L(b0)
- cmp $2, R32(n_param)
- ja L(b3)
- jz L(b2)
-
-L(b1): mul v0
- cmp $1, n
- jz L(n1)
- mov %rax, w2
- mov %rdx, w3
- neg n
- add $3, n
- jmp L(L1)
-L(n1): ADDSUB %rax, 8(rp)
- adc $0, %rdx
- mov %rdx, %rax
- pop %rbx
-IFDOS(` pop %rdi ')
-IFDOS(` pop %rsi ')
- ret
-
-L(b3): mul v0
- mov %rax, w2
- mov %rdx, w3
- neg n
- inc n
- jmp L(L3)
-
-L(b0): mul v0
- mov %rax, w0
- mov %rdx, w1
- neg n
- add $2, n
- jmp L(L0)
-
-L(b2): mul v0
- mov %rax, w0
- mov %rdx, w1
- neg n
- jmp L(L2)
-
- ALIGN(16)
-L(top): ADDSUB w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
-L(L1): mov 0(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- ADDSUB w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
-L(L0): mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- ADDSUB w0, 0(rp,n,8)
- adc w1, w2
- adc $0, w3
-L(L3): mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- ADDSUB w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
-L(L2): mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(top)
-
-L(end): ADDSUB w0, (rp)
- adc w1, w2
- adc $0, w3
- ADDSUB w2, 8(rp)
- adc $0, w3
- mov w3, %rax
-
- pop %rbx
-IFDOS(` pop %rdi ')
-IFDOS(` pop %rsi ')
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bobcat/copyd.asm b/gmp/mpn/x86_64/bobcat/copyd.asm
deleted file mode 100644
index 877714e903..0000000000
--- a/gmp/mpn/x86_64/bobcat/copyd.asm
+++ /dev/null
@@ -1,91 +0,0 @@
-dnl AMD64 mpn_copyd optimised for AMD bobcat.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 1
-C AMD K10 1-2 (alignment fluctuations)
-C AMD bd1 ?
-C AMD bobcat 1.5
-C Intel P4 2.8
-C Intel core2 1
-C Intel NHM 1-1.25
-C Intel SBR 1
-C Intel atom 2.87
-C VIA nano 2
-
-C INPUT PARAMETERS
-C rp rdi
-C up rsi
-C n rdx
-
-define(`rp',`%rdi')
-define(`up',`%rsi')
-define(`n',`%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_copyd)
- FUNC_ENTRY(3)
- sub $4, n
- jl L(end)
- ALIGN(16)
-L(top): mov 24(up,n,8), %r8
- mov %r8, 24(rp,n,8)
- mov 16(up,n,8), %r8
- mov %r8, 16(rp,n,8)
- mov 8(up,n,8), %r8
- mov %r8, 8(rp,n,8)
- mov (up,n,8), %r8
- mov %r8, (rp,n,8)
-L(ent): sub $4, n
- jge L(top)
-
-L(end): cmp $-4, R32(n)
- jz L(ret)
- mov 24(up,n,8), %r8
- mov %r8, 24(rp,n,8)
- cmp $-3, R32(n)
- jz L(ret)
- mov 16(up,n,8), %r8
- mov %r8, 16(rp,n,8)
- cmp $-2, R32(n)
- jz L(ret)
- mov 8(up,n,8), %r8
- mov %r8, 8(rp,n,8)
-
-L(ret): FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bobcat/copyi.asm b/gmp/mpn/x86_64/bobcat/copyi.asm
deleted file mode 100644
index ee0f578652..0000000000
--- a/gmp/mpn/x86_64/bobcat/copyi.asm
+++ /dev/null
@@ -1,94 +0,0 @@
-dnl AMD64 mpn_copyi optimised for AMD bobcat.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 1
-C AMD K10 1-2 (alignment fluctuations)
-C AMD bd1 ?
-C AMD bobcat 1.5
-C Intel P4 2.8
-C Intel core2 1
-C Intel NHM 1-1.25
-C Intel SBR 1
-C Intel atom 2.87
-C VIA nano 2
-
-C INPUT PARAMETERS
-C rp rdi
-C up rsi
-C n rdx
-
-define(`rp',`%rdi')
-define(`up',`%rsi')
-define(`n',`%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_copyi)
- FUNC_ENTRY(3)
- lea -32(up,n,8), up
- lea -32(rp,n,8), rp
- neg n
- add $4, n
- jg L(end)
- ALIGN(16)
-L(top): mov (up,n,8), %r8
- mov %r8, (rp,n,8)
- mov 8(up,n,8), %r8
- mov %r8, 8(rp,n,8)
- mov 16(up,n,8), %r8
- mov %r8, 16(rp,n,8)
- mov 24(up,n,8), %r8
- mov %r8, 24(rp,n,8)
-L(ent): add $4, n
- jle L(top)
-
-L(end): cmp $4, R32(n)
- jz L(ret)
- mov (up,n,8), %r8
- mov %r8, (rp,n,8)
- cmp $3, R32(n)
- jz L(ret)
- mov 8(up,n,8), %r8
- mov %r8, 8(rp,n,8)
- cmp $2, R32(n)
- jz L(ret)
- mov 16(up,n,8), %r8
- mov %r8, 16(rp,n,8)
-
-L(ret): FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bobcat/gmp-mparam.h b/gmp/mpn/x86_64/bobcat/gmp-mparam.h
deleted file mode 100644
index de4c4e4528..0000000000
--- a/gmp/mpn/x86_64/bobcat/gmp-mparam.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/* AMD Bobcat gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2012, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-#undef HAVE_NATIVE_mpn_mul_2
-#undef HAVE_NATIVE_mpn_addmul_2
-
-/* 1600 MHz AMD Bobcat Zacate E-350 */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 32
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 43
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 17
-
-#define MUL_TOOM22_THRESHOLD 24
-#define MUL_TOOM33_THRESHOLD 36
-#define MUL_TOOM44_THRESHOLD 268
-#define MUL_TOOM6H_THRESHOLD 396
-#define MUL_TOOM8H_THRESHOLD 517
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 195
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 181
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 103
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 26
-#define SQR_TOOM3_THRESHOLD 93
-#define SQR_TOOM4_THRESHOLD 375
-#define SQR_TOOM6_THRESHOLD 0 /* always */
-#define SQR_TOOM8_THRESHOLD 478
-
-#define MULMID_TOOM42_THRESHOLD 22
-
-#define MULMOD_BNM1_THRESHOLD 11
-#define SQRMOD_BNM1_THRESHOLD 13
-
-#define MUL_FFT_MODF_THRESHOLD 400 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 400, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 20, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \
- { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
- { 49, 9}, { 27,10}, { 15, 9}, { 43,10}, \
- { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 99,10}, { 55,11}, { 31,10}, { 63, 9}, \
- { 127,10}, { 71, 9}, { 143,10}, { 79,11}, \
- { 47,10}, { 103,12}, { 31,11}, { 63,10}, \
- { 143,11}, { 79,10}, { 167,11}, { 95,10}, \
- { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,11}, { 143,10}, \
- { 287,11}, { 159,10}, { 319,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543,11}, { 287,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 351,10}, { 703,11}, \
- { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,12}, { 223,11}, { 447,13}, { 127,12}, \
- { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
- { 319,11}, { 639,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,12}, { 447,14}, { 127,13}, { 255,12}, \
- { 607,13}, { 319,12}, { 703,13}, { 383,12}, \
- { 831,13}, { 447,12}, { 895,14}, { 255,13}, \
- { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \
- { 703,14}, { 383,13}, { 831,12}, { 1663,13}, \
- { 895,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1151,14}, { 639,13}, { 1407,14}, \
- { 767,13}, { 1663,14}, { 895,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2431,14}, \
- { 1279,13}, { 2559,14}, { 1407,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 156
-#define MUL_FFT_THRESHOLD 5504
-
-#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
- { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
- { 23, 9}, { 55,10}, { 31, 9}, { 71,10}, \
- { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
- { 55,11}, { 31,10}, { 79,11}, { 47,10}, \
- { 103,12}, { 31,11}, { 63,10}, { 127, 9}, \
- { 255,10}, { 135, 9}, { 271,11}, { 79,10}, \
- { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
- { 383,11}, { 111,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
- { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
- { 319,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,10}, { 415,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 271,10}, { 575,11}, \
- { 303,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671,11}, { 351,10}, { 703,11}, \
- { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,12}, { 223,11}, { 447,13}, { 127,12}, \
- { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
- { 319,11}, { 671,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,12}, { 479,14}, { 127,13}, { 255,12}, \
- { 607,13}, { 319,12}, { 703,13}, { 383,12}, \
- { 831,13}, { 447,12}, { 895,14}, { 255,13}, \
- { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \
- { 703,14}, { 383,13}, { 831,12}, { 1663,13}, \
- { 895,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1151,14}, { 639,13}, { 1343,12}, \
- { 2687,14}, { 767,13}, { 1599,12}, { 3199,13}, \
- { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \
- { 2687,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 162
-#define SQR_FFT_THRESHOLD 3712
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 26
-#define MULLO_MUL_N_THRESHOLD 10950
-
-#define DC_DIV_QR_THRESHOLD 63
-#define DC_DIVAPPR_Q_THRESHOLD 198
-#define DC_BDIV_QR_THRESHOLD 56
-#define DC_BDIV_Q_THRESHOLD 127
-
-#define INV_MULMOD_BNM1_THRESHOLD 46
-#define INV_NEWTON_THRESHOLD 236
-#define INV_APPR_THRESHOLD 204
-
-#define BINV_NEWTON_THRESHOLD 286
-#define REDC_1_TO_REDC_2_THRESHOLD 63
-#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
-
-#define MU_DIV_QR_THRESHOLD 1499
-#define MU_DIVAPPR_Q_THRESHOLD 1499
-#define MUPI_DIV_QR_THRESHOLD 84
-#define MU_BDIV_QR_THRESHOLD 1334
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 1,16,194,904,1167
-
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 102
-#define HGCD_APPR_THRESHOLD 50
-#define HGCD_REDUCE_THRESHOLD 2681
-#define GCD_DC_THRESHOLD 416
-#define GCDEXT_DC_THRESHOLD 293
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 15
-#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 248
-#define SET_STR_PRECOMPUTE_THRESHOLD 1160
-
-#define FAC_DSC_THRESHOLD 746
-#define FAC_ODD_THRESHOLD 44
diff --git a/gmp/mpn/x86_64/bobcat/mul_1.asm b/gmp/mpn/x86_64/bobcat/mul_1.asm
deleted file mode 100644
index cb58bef0b3..0000000000
--- a/gmp/mpn/x86_64/bobcat/mul_1.asm
+++ /dev/null
@@ -1,187 +0,0 @@
-dnl AMD64 mpn_mul_1 optimised for AMD bobcat.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 4.5
-C AMD K10 4.5
-C AMD bd1 4.62
-C AMD bobcat 5
-C Intel P4 14
-C Intel core2 4.5
-C Intel NHM 4.23
-C Intel SBR 3.0
-C Intel atom 21
-C VIA nano 4.94
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-C Standard parameters
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param', `%rdx')
-define(`v0', `%rcx')
-define(`cy', `%r8')
-C Standard allocations
-define(`n', `%rbx')
-define(`w0', `%r8')
-define(`w1', `%r9')
-define(`w2', `%r10')
-define(`w3', `%r11')
-
-C DOS64 parameters
-IFDOS(` define(`rp', `%rcx') ') dnl
-IFDOS(` define(`up', `%rsi') ') dnl
-IFDOS(` define(`n_param', `%r8') ') dnl
-IFDOS(` define(`v0', `%r9') ') dnl
-IFDOS(` define(`cy', `64(%rsp)')') dnl
-C DOS64 allocations
-IFDOS(` define(`n', `%rbx') ') dnl
-IFDOS(` define(`w0', `%r8') ') dnl
-IFDOS(` define(`w1', `%rdi') ') dnl
-IFDOS(` define(`w2', `%r10') ') dnl
-IFDOS(` define(`w3', `%r11') ') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_1c)
-IFDOS(` push %rsi ')
-IFDOS(` push %rdi ')
-IFDOS(` mov %rdx, %rsi ')
- mov cy, w2
- jmp L(com)
-EPILOGUE()
-
-PROLOGUE(mpn_mul_1)
-IFDOS(` push %rsi ')
-IFDOS(` push %rdi ')
-IFDOS(` mov %rdx, %rsi ')
- xor w2, w2
-L(com): push %rbx
- mov (up), %rax
-
- lea -16(rp,n_param,8), rp
- lea -16(up,n_param,8), up
-
- mov n_param, n
- and $3, R32(n_param)
- jz L(b0)
- cmp $2, R32(n_param)
- ja L(b3)
- jz L(b2)
-
-L(b1): mul v0
- cmp $1, n
- jz L(n1)
- neg n
- add $3, n
- add %rax, w2
- mov %rdx, w3
- jmp L(L1)
-L(n1): add %rax, w2
- mov %rdx, %rax
- mov w2, 8(rp)
- adc $0, %rax
- pop %rbx
-IFDOS(` pop %rdi ')
-IFDOS(` pop %rsi ')
- ret
-
-L(b3): mul v0
- neg n
- inc n
- add %rax, w2
- mov %rdx, w3
- jmp L(L3)
-
-L(b0): mul v0
- mov %rax, w0
- mov %rdx, w1
- neg n
- add $2, n
- add w2, w0
- jmp L(L0)
-
-L(b2): mul v0
- mov %rax, w0
- mov %rdx, w1
- neg n
- add w2, w0
- jmp L(L2)
-
- ALIGN(16)
-L(top): mov w0, -16(rp,n,8)
- add w1, w2
-L(L1): adc $0, w3
- mov 0(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, -8(rp,n,8)
- add w3, w0
-L(L0): adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- mov w0, 0(rp,n,8)
- add w1, w2
-L(L3): adc $0, w3
- mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, 8(rp,n,8)
- add w3, w0
-L(L2): adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(top)
-
-L(end): mov w0, (rp)
- add w1, w2
- adc $0, w3
- mov w2, 8(rp)
- mov w3, %rax
-
- pop %rbx
-IFDOS(` pop %rdi ')
-IFDOS(` pop %rsi ')
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bobcat/mul_basecase.asm b/gmp/mpn/x86_64/bobcat/mul_basecase.asm
deleted file mode 100644
index e7d46bfcff..0000000000
--- a/gmp/mpn/x86_64/bobcat/mul_basecase.asm
+++ /dev/null
@@ -1,486 +0,0 @@
-dnl AMD64 mpn_mul_basecase optimised for AMD bobcat.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 4.5
-C AMD K10 4.5
-C AMD bd1 4.75
-C AMD bobcat 5
-C Intel P4 17.7
-C Intel core2 5.5
-C Intel NHM 5.43
-C Intel SBR 3.92
-C Intel atom 23
-C VIA nano 5.63
-
-C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
-C multiply insn bandwidth, without any apparent loop branch exit pipeline
-C replays experienced on K8. The structure is unusual: it falls into mul_1 in
-C the same way for all n, then it splits into 4 different wind-down blocks and
-C 4 separate addmul_1 loops.
-C
-C We have not tried using the same addmul_1 loops with a switch into feed-in
-C code, as we do in other basecase implementations. Doing that could save
-C substantial code volume, but would also probably add some overhead.
-
-C TODO
-C * Tune un < 3 code.
-C * Fix slowdown for un=vn=3 (67->71) compared to default code.
-C * This is 1263 bytes, compared to 1099 bytes for default code. Consider
-C combining addmul loops like that code. Tolerable slowdown?
-C * Lots of space could be saved by replacing the "switch" code by gradual
-C jumps out from mul_1 winddown code, perhaps with no added overhead.
-C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-C Standard parameters
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param', `%rdx')
-define(`vp', `%rcx')
-define(`vn', `%r8')
-C Standard allocations
-define(`un', `%rbx')
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`n', `%rbp')
-define(`v0', `%r9')
-
-C Temp macro for allowing control over indexing.
-C Define to return $1 for more conservative ptr handling.
-define(`X',`$2')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
-
- mov (up), %rax
- mov (vp), v0
-
- cmp $2, un_param
- ja L(ge3)
- jz L(u2)
-
- mul v0 C u0 x v0
- mov %rax, (rp)
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-
-L(u2): mul v0 C u0 x v0
- mov %rax, (rp)
- mov 8(up), %rax
- mov %rdx, w0
- mul v0
- add %rax, w0
- mov %rdx, w1
- adc $0, w1
- cmp $1, R32(vn)
- jnz L(u2v2)
- mov w0, 8(rp)
- mov w1, 16(rp)
- FUNC_EXIT()
- ret
-
-L(u2v2):mov 8(vp), v0
- mov (up), %rax
- mul v0
- add %rax, w0
- mov w0, 8(rp)
- mov %rdx, %r8 C CAUTION: r8 realloc
- adc $0, %r8
- mov 8(up), %rax
- mul v0
- add w1, %r8
- adc $0, %rdx
- add %r8, %rax
- adc $0, %rdx
- mov %rax, 16(rp)
- mov %rdx, 24(rp)
- FUNC_EXIT()
- ret
-
-
-L(ge3): push %rbx
- push %rbp
- push %r12
- push %r13
-
- lea 8(vp), vp
-
- lea -24(rp,un_param,8), rp
- lea -24(up,un_param,8), up
- xor R32(un), R32(un)
- mov $2, R32(n)
- sub un_param, un
- sub un_param, n
-
- mul v0
- mov %rax, w2
- mov %rdx, w3
- jmp L(L3)
-
- ALIGN(16)
-L(top): mov w0, -16(rp,n,8)
- add w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, -8(rp,n,8)
- add w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- mov w0, (rp,n,8)
- add w1, w2
- adc $0, w3
-L(L3): mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, 8(rp,n,8)
- add w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(top)
-
- mov w0, -16(rp,n,8)
- add w1, w2
- adc $0, w3
-
-C Switch on n into right addmul_l loop
- test n, n
- jz L(r2)
- cmp $2, R32(n)
- ja L(r3)
- jz L(r0)
- jmp L(r1)
-
-
-L(r3): mov w2, X(-8(rp,n,8),16(rp))
- mov w3, X((rp,n,8),24(rp))
- add $2, un
-
-C outer loop(3)
-L(to3): dec vn
- jz L(ret)
- mov (vp), v0
- mov 8(up,un,8), %rax
- lea 8(vp), vp
- lea 8(rp), rp
- mov un, n
- mul v0
- mov %rax, w2
- mov %rdx, w3
- jmp L(al3)
-
- ALIGN(16)
-L(ta3): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
-L(al3): mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta3)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
- jmp L(to3)
-
-
-L(r2): mov X(0(up,n,8),(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(-8(rp,n,8),-8(rp))
- add w3, w0
- adc $0, w1
- mov X(8(up,n,8),8(up)), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- mov w0, X((rp,n,8),(rp))
- add w1, w2
- adc $0, w3
- mov X(16(up,n,8),16(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(8(rp,n,8),8(rp))
- add w3, w0
- adc $0, w1
- mov w0, X(16(rp,n,8),16(rp))
- adc $0, w3
- mov w1, X(24(rp,n,8),24(rp))
- inc un
-
-C outer loop(2)
-L(to2): dec vn
- jz L(ret)
- mov (vp), v0
- mov 16(up,un,8), %rax
- lea 8(vp), vp
- lea 8(rp), rp
- mov un, n
- mul v0
- mov %rax, w0
- mov %rdx, w1
- jmp L(al2)
-
- ALIGN(16)
-L(ta2): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
- mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
-L(al2): mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta2)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
- jmp L(to2)
-
-
-L(r1): mov X(0(up,n,8),8(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(-8(rp,n,8),(rp))
- add w3, w0
- adc $0, w1
- mov X(8(up,n,8),16(up)), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- mov w0, X((rp,n,8),8(rp))
- add w1, w2
- adc $0, w3
- mov w2, X(8(rp,n,8),16(rp))
- mov w3, X(16(rp,n,8),24(rp))
- add $4, un
-
-C outer loop(1)
-L(to1): dec vn
- jz L(ret)
- mov (vp), v0
- mov -8(up,un,8), %rax
- lea 8(vp), vp
- lea 8(rp), rp
- mov un, n
- mul v0
- mov %rax, w2
- mov %rdx, w3
- jmp L(al1)
-
- ALIGN(16)
-L(ta1): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
-L(al1): mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
- mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta1)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
- jmp L(to1)
-
-
-L(r0): mov X((up,n,8),16(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(-8(rp,n,8),8(rp))
- add w3, w0
- adc $0, w1
- mov w0, X((rp,n,8),16(rp))
- mov w1, X(8(rp,n,8),24(rp))
- add $3, un
-
-C outer loop(0)
-L(to0): dec vn
- jz L(ret)
- mov (vp), v0
- mov (up,un,8), %rax
- lea 8(vp), vp
- lea 8(rp), rp
- mov un, n
- mul v0
- mov %rax, w0
- mov %rdx, w1
- jmp L(al0)
-
- ALIGN(16)
-L(ta0): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
-L(al0): mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
- mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta0)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
- jmp L(to0)
-
-
-L(ret): pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/bobcat/redc_1.asm b/gmp/mpn/x86_64/bobcat/redc_1.asm
deleted file mode 100644
index c3798021f7..0000000000
--- a/gmp/mpn/x86_64/bobcat/redc_1.asm
+++ /dev/null
@@ -1,502 +0,0 @@
-dnl X86-64 mpn_redc_1 optimised for AMD bobcat.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bull ?
-C AMD pile ?
-C AMD steam ?
-C AMD bobcat 5.0
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom ?
-C VIA nano ?
-
-C TODO
-C * Micro-optimise, none performed thus far.
-C * Consider inlining mpn_add_n.
-C * Single basecases out before the pushes.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%r12')
-define(`q0', `%r13')
-define(`w0', `%rbp')
-define(`w1', `%r9')
-define(`w2', `%r10')
-define(`w3', `%r11')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_redc_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (up), q0
- mov n, j C outer loop induction var
- lea (mp_param,n,8), mp
- lea (up,n,8), up
- neg n
- imul u0inv, q0 C first iteration q0
-
- test $1, R8(n)
- jz L(bx0)
-
-L(bx1): test $2, R8(n)
- jz L(b3)
-
-L(b1): cmp $-1, R32(n)
- jz L(n1)
-
-L(otp1):lea 1(n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- mov 8(mp,n,8), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, w1
- add (up,n,8), w2
- adc w3, %rbx
- adc $0, w1
- mov 16(mp,n,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- adc w1, w2
- adc $0, w3
- imul u0inv, %rbx C next q limb
- jmp L(e1)
-
- ALIGNx
-L(tp1): add w0, -16(up,i,8)
- adc w1, w2
- adc $0, w3
- mov (mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(up,i,8)
- adc w3, w0
- adc $0, w1
- mov 8(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add w0, (up,i,8)
- adc w1, w2
- adc $0, w3
-L(e1): mov 16(mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(up,i,8)
- adc w3, w0
- adc $0, w1
- mov 24(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add $4, i
- js L(tp1)
-
-L(ed1): add w0, I(-16(up),-16(up,i,8))
- adc w1, w2
- adc $0, w3
- add w2, I(-8(up),-8(up,i,8))
- adc $0, w3
- mov w3, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp1)
- jmp L(cj)
-
-L(b3): cmp $-3, R32(n)
- jz L(n3)
-
-L(otp3):lea 3(n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- mov 8(mp,n,8), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, w1
- add (up,n,8), w2
- adc w3, %rbx
- adc $0, w1
- mov 16(mp,n,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- adc w1, w2
- adc $0, w3
- imul u0inv, %rbx C next q limb
- jmp L(e3)
-
- ALIGNx
-L(tp3): add w0, -16(up,i,8)
- adc w1, w2
- adc $0, w3
-L(e3): mov (mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(up,i,8)
- adc w3, w0
- adc $0, w1
- mov 8(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add w0, (up,i,8)
- adc w1, w2
- adc $0, w3
- mov 16(mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(up,i,8)
- adc w3, w0
- adc $0, w1
- mov 24(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add $4, i
- js L(tp3)
-
-L(ed3): add w0, I(-16(up),-16(up,i,8))
- adc w1, w2
- adc $0, w3
- add w2, I(-8(up),-8(up,i,8))
- adc $0, w3
- mov w3, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp3)
-C jmp L(cj)
-
-L(cj):
-IFSTD(` lea (up,n,8), up C param 2: up
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` lea (up,n,8), %rdx C param 2: up
- lea (%rdx,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov rp, %rcx ') C param 1: rp
-
- CALL( mpn_add_n)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(bx0): test $2, R8(n)
- jnz L(b2)
-
-L(b0):
-L(otp0):lea (n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- mov 8(mp,n,8), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, w3
- add (up,n,8), w0
- adc w1, %rbx
- adc $0, w3
- mov 16(mp,n,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- adc w3, w0
- adc $0, w1
- imul u0inv, %rbx C next q limb
- jmp L(e0)
-
- ALIGNx
-L(tp0): add w0, -16(up,i,8)
- adc w1, w2
- adc $0, w3
- mov (mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(up,i,8)
- adc w3, w0
- adc $0, w1
- mov 8(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add w0, (up,i,8)
- adc w1, w2
- adc $0, w3
- mov 16(mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(up,i,8)
- adc w3, w0
- adc $0, w1
-L(e0): mov 24(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add $4, i
- js L(tp0)
-
-L(ed0): add w0, I(-16(up),-16(up,i,8))
- adc w1, w2
- adc $0, w3
- add w2, I(-8(up),-8(up,i,8))
- adc $0, w3
- mov w3, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp0)
- jmp L(cj)
-
-L(b2): cmp $-2, R32(n)
- jz L(n2)
-
-L(otp2):lea 2(n), i
- mov (mp,n,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- mov 8(mp,n,8), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, w3
- add (up,n,8), w0
- adc w1, %rbx
- adc $0, w3
- mov 16(mp,n,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add 8(up,n,8), %rbx
- mov %rbx, 8(up,n,8)
- adc w3, w0
- adc $0, w1
- imul u0inv, %rbx C next q limb
- jmp L(e2)
-
- ALIGNx
-L(tp2): add w0, -16(up,i,8)
- adc w1, w2
- adc $0, w3
- mov (mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(up,i,8)
- adc w3, w0
- adc $0, w1
-L(e2): mov 8(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add w0, (up,i,8)
- adc w1, w2
- adc $0, w3
- mov 16(mp,i,8), %rax
- mul q0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(up,i,8)
- adc w3, w0
- adc $0, w1
- mov 24(mp,i,8), %rax
- mul q0
- mov %rax, w2
- mov %rdx, w3
- add $4, i
- js L(tp2)
-
-L(ed2): add w0, I(-16(up),-16(up,i,8))
- adc w1, w2
- adc $0, w3
- add w2, I(-8(up),-8(up,i,8))
- adc $0, w3
- mov w3, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp2)
- jmp L(cj)
-
-L(n1): mov (mp_param), %rax
- mul q0
- add -8(up), %rax
- adc (up), %rdx
- mov %rdx, (rp)
- mov $0, R32(%rax)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-L(n2): mov (mp_param), %rax
- mov -16(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov -8(up), %r10
- mul q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- add %r9, %r10
- adc $0, %r11
- mov %r10, q0
- imul u0inv, q0 C next q0
- mov -16(mp), %rax
- mul q0
- add %rax, %r10
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov (up), %r14
- mul q0
- add %rax, %r14
- adc $0, %rdx
- add %r9, %r14
- adc $0, %rdx
- xor R32(%rax), R32(%rax)
- add %r11, %r14
- adc 8(up), %rdx
- mov %r14, (rp)
- mov %rdx, 8(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
- ALIGNx
-L(n3): mov -24(mp), %rax
- mov -24(up), %r10
- mul q0
- add %rax, %r10
- mov -16(mp), %rax
- mov %rdx, %r11
- adc $0, %r11
- mov -16(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- add %r11, %rbp
- mov -8(up), %r10
- adc $0, %r9
- mul q0
- mov %rbp, q0
- imul u0inv, q0 C next q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- mov %rbp, -16(up)
- add %r9, %r10
- adc $0, %r11
- mov %r10, -8(up)
- mov %r11, -24(up) C up[0]
- lea 8(up), up C up++
- dec j
- jnz L(n3)
-
- mov -48(up), %rdx
- mov -40(up), %rbx
- xor R32(%rax), R32(%rax)
- add %rbp, %rdx
- adc %r10, %rbx
- adc -8(up), %r11
- mov %rdx, (rp)
- mov %rbx, 8(rp)
- mov %r11, 16(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/bobcat/sqr_basecase.asm b/gmp/mpn/x86_64/bobcat/sqr_basecase.asm
deleted file mode 100644
index 0e417a1ebe..0000000000
--- a/gmp/mpn/x86_64/bobcat/sqr_basecase.asm
+++ /dev/null
@@ -1,565 +0,0 @@
-dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 4.5
-C AMD K10 4.5
-C AMD bd1 4.75
-C AMD bobcat 5
-C Intel P4 17.7
-C Intel core2 5.5
-C Intel NHM 5.43
-C Intel SBR 3.92
-C Intel atom 23
-C VIA nano 5.63
-
-C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
-C multiply insn bandwidth, without any apparent loop branch exit pipeline
-C replays experienced on K8. The structure is unusual: it falls into mul_1 in
-C the same way for all n, then it splits into 4 different wind-down blocks and
-C 4 separate addmul_1 loops.
-C
-C We have not tried using the same addmul_1 loops with a switch into feed-in
-C code, as we do in other basecase implementations. Doing that could save
-C substantial code volume, but would also probably add some overhead.
-
-C TODO
-C * Tune un < 4 code.
-C * Perhaps implement a larger final corner (it is now 2 x 1).
-C * Lots of space could be saved by replacing the "switch" code by gradual
-C jumps out from mul_1 winddown code, perhaps with no added overhead.
-C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-C Standard parameters
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param', `%rdx')
-C Standard allocations
-define(`un', `%rbx')
-define(`w0', `%r8')
-define(`w1', `%r9')
-define(`w2', `%r10')
-define(`w3', `%r11')
-define(`n', `%rbp')
-define(`v0', `%rcx')
-
-C Temp macro for allowing control over indexing.
-C Define to return $1 for more conservative ptr handling.
-define(`X',`$2')
-dnl define(`X',`$1')
-
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_sqr_basecase)
- FUNC_ENTRY(3)
-
- mov (up), %rax
-
- cmp $2, R32(un_param)
- jae L(ge2)
-
- mul %rax
- mov %rax, (rp)
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-
-L(ge2): mov (up), v0
- jnz L(g2)
-
- mul %rax
- mov %rax, (rp)
- mov 8(up), %rax
- mov %rdx, w0
- mul v0
- add %rax, w0
- mov %rdx, w1
- adc $0, w1
- mov 8(up), v0
- mov (up), %rax
- mul v0
- add %rax, w0
- mov w0, 8(rp)
- mov %rdx, w0 C CAUTION: r8 realloc
- adc $0, w0
- mov 8(up), %rax
- mul v0
- add w1, w0
- adc $0, %rdx
- add w0, %rax
- adc $0, %rdx
- mov %rax, 16(rp)
- mov %rdx, 24(rp)
- FUNC_EXIT()
- ret
-
-L(g2): cmp $3, R32(un_param)
- ja L(g3)
- mul %rax
- mov %rax, (rp)
- mov %rdx, 8(rp)
- mov 8(up), %rax
- mul %rax
- mov %rax, 16(rp)
- mov %rdx, 24(rp)
- mov 16(up), %rax
- mul %rax
- mov %rax, 32(rp)
- mov %rdx, 40(rp)
-
- mov (up), v0
- mov 8(up), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov 16(up), %rax
- mul v0
- xor R32(w2), R32(w2)
- add %rax, w1
- adc %rdx, w2
-
- mov 8(up), v0
- mov 16(up), %rax
- mul v0
- xor R32(w3), R32(w3)
- add %rax, w2
- adc %rdx, w3
- add w0, w0
- adc w1, w1
- adc w2, w2
- adc w3, w3
- mov $0, R32(v0)
- adc v0, v0
- add w0, 8(rp)
- adc w1, 16(rp)
- adc w2, 24(rp)
- adc w3, 32(rp)
- adc v0, 40(rp)
- FUNC_EXIT()
- ret
-
-L(g3): push %rbx
- push %rbp
-
- mov 8(up), %rax
- lea -24(rp,un_param,8), rp
- lea -24(up,un_param,8), up
- neg un_param
- push un_param C for sqr_diag_addlsh1
- lea (un_param), un
- lea 3(un_param), n
-
- mul v0
- mov %rax, w2
- mov %rdx, w3
- jmp L(L3)
-
- ALIGN(16)
-L(top): mov w0, -16(rp,n,8)
- add w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, -8(rp,n,8)
- add w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- mov w0, (rp,n,8)
- add w1, w2
- adc $0, w3
-L(L3): mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, 8(rp,n,8)
- add w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(top)
-
- mov w0, -16(rp,n,8)
- add w1, w2
- adc $0, w3
-
- test n, n
- jz L(r2)
- cmp $2, R32(n)
- ja L(r3)
- jz L(r0)
-
-
-L(r1): mov X((up,n,8),8(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(-8(rp,n,8),(rp))
- add w3, w0
- adc $0, w1
- mov X(8(up,n,8),16(up)), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- mov w0, X((rp,n,8),8(rp))
- add w1, w2
- adc $0, w3
- mov w2, X(8(rp,n,8),16(rp))
- mov w3, X(16(rp,n,8),24(rp))
- add $5, un
- jmp L(to0)
-
-L(r2): mov X((up,n,8),(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(-8(rp,n,8),-8(rp))
- add w3, w0
- adc $0, w1
- mov X(8(up,n,8),8(up)), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- mov w0, X((rp,n,8),(rp))
- add w1, w2
- adc $0, w3
- mov X(16(up,n,8),16(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(8(rp,n,8),8(rp))
- add w3, w0
- adc $0, w1
- mov w0, X(16(rp,n,8),16(rp))
- adc $0, w3
- mov w1, X(24(rp,n,8),24(rp))
- add $6, un
- jmp L(to1)
-
-L(r3): mov w2, X(-8(rp,n,8),16(rp))
- mov w3, X((rp,n,8),24(rp))
- add $3, un
- jmp L(to2)
-
-L(r0): mov X((up,n,8),16(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov w2, X(-8(rp,n,8),8(rp))
- add w3, w0
- adc $0, w1
- mov w0, X((rp,n,8),16(rp))
- mov w1, X(8(rp,n,8),24(rp))
- add $4, un
-C jmp L(to3)
-C fall through into main loop
-
-
-L(outer):
- mov un, n
- mov (up,un,8), v0
- mov 8(up,un,8), %rax
- lea 8(rp), rp
- mul v0
- mov %rax, w2
- mov %rdx, w3
- jmp L(al3)
-
- ALIGN(16)
-L(ta3): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
-L(al3): mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta3)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
-
-
-L(to2): mov un, n
- cmp $-4, R32(un)
- jnc L(end)
- add $4, un
- mov 8(up,n,8), v0
- mov 16(up,n,8), %rax
- lea 8(rp), rp
- mul v0
- mov %rax, w0
- mov %rdx, w1
- jmp L(al2)
-
- ALIGN(16)
-L(ta2): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
- mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
-L(al2): mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta2)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
-
-
-L(to1): mov un, n
- mov -16(up,un,8), v0
- mov -8(up,un,8), %rax
- lea 8(rp), rp
- mul v0
- mov %rax, w2
- mov %rdx, w3
- jmp L(al1)
-
- ALIGN(16)
-L(ta1): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
-L(al1): mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
- mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta1)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
-
-
-L(to0): mov un, n
- mov -8(up,un,8), v0
- mov (up,un,8), %rax
- lea 8(rp), rp
- mul v0
- mov %rax, w0
- mov %rdx, w1
- jmp L(al0)
-
- ALIGN(16)
-L(ta0): add w0, -16(rp,n,8)
- adc w1, w2
- adc $0, w3
- mov (up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, -8(rp,n,8)
- adc w3, w0
- adc $0, w1
-L(al0): mov 8(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, (rp,n,8)
- adc w1, w2
- adc $0, w3
- mov 16(up,n,8), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- add w2, 8(rp,n,8)
- adc w3, w0
- adc $0, w1
- mov 24(up,n,8), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add $4, n
- js L(ta0)
-
- add w0, X(-16(rp,n,8),8(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(-8(rp,n,8),16(rp))
- adc $0, w3
- mov w3, X((rp,n,8),24(rp))
- jmp L(outer)
-
-
-L(end): mov X(8(up,un,8),(up)), v0
- mov X(16(up,un,8),8(up)), %rax
- mul v0
- mov %rax, w0
- mov %rdx, w1
- mov X(24(up,un,8),16(up)), %rax
- mul v0
- mov %rax, w2
- mov %rdx, w3
- add w0, X(24(rp,un,8),16(rp))
- adc w1, w2
- adc $0, w3
- add w2, X(32(rp,un,8),24(rp))
- adc $0, w3
- mov X(16(up,un,8),8(up)), v0
- mov X(24(up,un,8),16(up)), %rax
- mul v0
- add %rax, w3
- mov w3, X(40(rp,un,8),32(rp))
- adc $0, %rdx
- mov %rdx, X(48(rp,un,8),40(rp))
-
-
-C sqr_diag_addlsh1
-
- lea 16(up), up
- lea 40(rp), rp
- pop n
- lea 2(n,n), n
-
- mov (up,n,4), %rax
- mul %rax
- xor R32(w2), R32(w2)
-
- mov 8(rp,n,8), w0
- mov %rax, (rp,n,8)
- jmp L(lm)
-
- ALIGN(8)
-L(tsd): add %rbx, w0
- adc %rax, w1
- mov w0, -8(rp,n,8)
- mov 8(rp,n,8), w0
- mov w1, (rp,n,8)
-L(lm): mov 16(rp,n,8), w1
- adc w0, w0
- adc w1, w1
- lea (%rdx,w2), %rbx
- mov 8(up,n,4), %rax
- setc R8(w2)
- mul %rax
- add $2, n
- js L(tsd)
-
-L(esd): add %rbx, w0
- adc %rax, w1
- mov w0, X(-8(rp,n,8),-8(rp))
- mov w1, X((rp,n,8),(rp))
- adc w2, %rdx
- mov %rdx, X(8(rp,n,8),8(rp))
-
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/cnd_aors_n.asm b/gmp/mpn/x86_64/cnd_aors_n.asm
deleted file mode 100644
index 13a2ab3be9..0000000000
--- a/gmp/mpn/x86_64/cnd_aors_n.asm
+++ /dev/null
@@ -1,183 +0,0 @@
-dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 2
-C AMD K10 2
-C AMD bd1 2.32
-C AMD bobcat 3
-C Intel P4 13
-C Intel core2 2.9
-C Intel NHM 2.8
-C Intel SBR 2.4
-C Intel atom 5.33
-C VIA nano 3
-
-C NOTES
-C * It might seem natural to use the cmov insn here, but since this function
-C is supposed to have the exact same execution pattern for cnd true and
-C false, and since cmov's documentation is not clear about whether it
-C actually reads both source operands and writes the register for a false
-C condition, we cannot use it.
-C * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
-C to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
-C ADCSBB-to-memory, again saving 1 insn/limb.
-C * This runs optimally at decoder bandwidth on K10. It has not been tuned
-C for any other processor.
-
-C INPUT PARAMETERS
-define(`cnd', `%rdi') dnl rcx
-define(`rp', `%rsi') dnl rdx
-define(`up', `%rdx') dnl r8
-define(`vp', `%rcx') dnl r9
-define(`n', `%r8') dnl rsp+40
-
-ifdef(`OPERATION_cnd_add_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_cnd_add_n)')
-ifdef(`OPERATION_cnd_sub_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_cnd_sub_n)')
-
-MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), R32(%r8)')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
-
- neg cnd
- sbb cnd, cnd C make cnd mask
-
- lea (vp,n,8), vp
- lea (up,n,8), up
- lea (rp,n,8), rp
-
- mov R32(n), R32(%rax)
- neg n
- and $3, R32(%rax)
- jz L(top) C carry-save reg rax = 0 in this arc
- cmp $2, R32(%rax)
- jc L(b1)
- jz L(b2)
-
-L(b3): mov (vp,n,8), %r12
- mov 8(vp,n,8), %r13
- mov 16(vp,n,8), %r14
- and cnd, %r12
- mov (up,n,8), %r10
- and cnd, %r13
- mov 8(up,n,8), %rbx
- and cnd, %r14
- mov 16(up,n,8), %rbp
- ADDSUB %r12, %r10
- mov %r10, (rp,n,8)
- ADCSBB %r13, %rbx
- mov %rbx, 8(rp,n,8)
- ADCSBB %r14, %rbp
- mov %rbp, 16(rp,n,8)
- sbb R32(%rax), R32(%rax) C save carry
- add $3, n
- js L(top)
- jmp L(end)
-
-L(b2): mov (vp,n,8), %r12
- mov 8(vp,n,8), %r13
- mov (up,n,8), %r10
- and cnd, %r12
- mov 8(up,n,8), %rbx
- and cnd, %r13
- ADDSUB %r12, %r10
- mov %r10, (rp,n,8)
- ADCSBB %r13, %rbx
- mov %rbx, 8(rp,n,8)
- sbb R32(%rax), R32(%rax) C save carry
- add $2, n
- js L(top)
- jmp L(end)
-
-L(b1): mov (vp,n,8), %r12
- mov (up,n,8), %r10
- and cnd, %r12
- ADDSUB %r12, %r10
- mov %r10, (rp,n,8)
- sbb R32(%rax), R32(%rax) C save carry
- add $1, n
- jns L(end)
-
- ALIGN(16)
-L(top): mov (vp,n,8), %r12
- mov 8(vp,n,8), %r13
- mov 16(vp,n,8), %r14
- mov 24(vp,n,8), %r11
- and cnd, %r12
- mov (up,n,8), %r10
- and cnd, %r13
- mov 8(up,n,8), %rbx
- and cnd, %r14
- mov 16(up,n,8), %rbp
- and cnd, %r11
- mov 24(up,n,8), %r9
- add R32(%rax), R32(%rax) C restore carry
- ADCSBB %r12, %r10
- mov %r10, (rp,n,8)
- ADCSBB %r13, %rbx
- mov %rbx, 8(rp,n,8)
- ADCSBB %r14, %rbp
- mov %rbp, 16(rp,n,8)
- ADCSBB %r11, %r9
- mov %r9, 24(rp,n,8)
- sbb R32(%rax), R32(%rax) C save carry
- add $4, n
- js L(top)
-
-L(end): neg R32(%rax)
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/com.asm b/gmp/mpn/x86_64/com.asm
deleted file mode 100644
index 006acaf648..0000000000
--- a/gmp/mpn/x86_64/com.asm
+++ /dev/null
@@ -1,95 +0,0 @@
-dnl AMD64 mpn_com.
-
-dnl Copyright 2004-2006, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 1.25
-C AMD K10 1.25
-C Intel P4 2.78
-C Intel core2 1.1
-C Intel corei 1.5
-C Intel atom ?
-C VIA nano 2
-
-C INPUT PARAMETERS
-define(`rp',`%rdi')
-define(`up',`%rsi')
-define(`n',`%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_com) C rp[i] = ~up[i] for i in [0,n), four limbs per loop pass
- FUNC_ENTRY(3)
- movq (up), %r8 C preload up[0]; happens before n is examined
- movl R32(%rdx), R32(%rax) C eax = low 32 bits of %rdx
- leaq (up,n,8), up C up -> just past the last source limb
- leaq (rp,n,8), rp C rp -> just past the last result limb
- negq n C n = -n: negative index that counts up to 0
- andl $3, R32(%rax) C (original n) mod 4 selects the loop entry point
- je L(b00) C n mod 4 == 0
- cmpl $2, R32(%rax)
- jc L(b01) C n mod 4 == 1
- je L(b10) C n mod 4 == 2; fall through for 3
-
-L(b11): notq %r8
- movq %r8, (rp,n,8) C store limb 0
- decq n C bias n so the 16/24 offsets at e11 address limbs 1 and 2
- jmp L(e11)
-L(b10): addq $-2, n C bias n so the 16/24 offsets at e10 address limbs 0 and 1
- jmp L(e10)
- .byte 0x90,0x90,0x90,0x90,0x90,0x90 C six NOP bytes; presumably padding to place L(oop) -- confirm
-L(b01): notq %r8
- movq %r8, (rp,n,8) C store limb 0
- incq n
- jz L(ret) C n was 1: nothing left to do
-
-L(oop): movq (up,n,8), %r8
-L(b00): movq 8(up,n,8), %r9
- notq %r8
- notq %r9
- movq %r8, (rp,n,8)
- movq %r9, 8(rp,n,8)
-L(e11): movq 16(up,n,8), %r8
-L(e10): movq 24(up,n,8), %r9
- notq %r8
- notq %r9
- movq %r8, 16(rp,n,8)
- movq %r9, 24(rp,n,8)
- addq $4, n
- jnc L(oop) C carry out of the add means n has reached 0
-L(ret): FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/com_n.asm b/gmp/mpn/x86_64/com_n.asm
new file mode 100644
index 0000000000..fba9384642
--- /dev/null
+++ b/gmp/mpn/x86_64/com_n.asm
@@ -0,0 +1,77 @@
+dnl AMD64 mpn_com_n.
+
+dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C K8,K9: 1.25
+C K10: 1.25
+C P4: 2.78
+C P6-15: 1.1
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_com_n) C rp[i] = ~up[i] for i in [0,n), four limbs per loop pass
+ movq (up), %r8 C preload up[0]; happens before n is examined
+ movl %edx, %eax C eax = low 32 bits of n (n is %rdx)
+ leaq (up,n,8), up C up -> just past the last source limb
+ leaq (rp,n,8), rp C rp -> just past the last result limb
+ negq n C n = -n: negative index that counts up to 0
+ andl $3, %eax C (original n) mod 4 selects the loop entry point
+ je L(b00) C n mod 4 == 0
+ cmpl $2, %eax
+ jc L(b01) C n mod 4 == 1
+ je L(b10) C n mod 4 == 2; fall through for 3
+
+L(b11): notq %r8
+ movq %r8, (rp,n,8) C store limb 0
+ decq n C bias n so the 16/24 offsets at e11 address limbs 1 and 2
+ jmp L(e11)
+L(b10): addq $-2, n C bias n so the 16/24 offsets at e10 address limbs 0 and 1
+ jmp L(e10)
+ .byte 0x90,0x90,0x90,0x90,0x90,0x90 C six NOP bytes; presumably padding to place L(oop) -- confirm
+L(b01): notq %r8
+ movq %r8, (rp,n,8) C store limb 0
+ incq n
+ jz L(ret) C n was 1: nothing left to do
+
+L(oop): movq (up,n,8), %r8
+L(b00): movq 8(up,n,8), %r9
+ notq %r8
+ notq %r9
+ movq %r8, (rp,n,8)
+ movq %r9, 8(rp,n,8)
+L(e11): movq 16(up,n,8), %r8
+L(e10): movq 24(up,n,8), %r9
+ notq %r8
+ notq %r9
+ movq %r8, 16(rp,n,8)
+ movq %r9, 24(rp,n,8)
+ addq $4, n
+ jnc L(oop) C carry out of the add means n has reached 0
+L(ret): ret
+EPILOGUE()
diff --git a/gmp/mpn/x86_64/copyd.asm b/gmp/mpn/x86_64/copyd.asm
index a5e6e595e7..759b11d3ed 100644
--- a/gmp/mpn/x86_64/copyd.asm
+++ b/gmp/mpn/x86_64/copyd.asm
@@ -1,93 +1,74 @@
dnl AMD64 mpn_copyd -- copy limb vector, decrementing.
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C AMD K8,K9 1
-C AMD K10 1
-C AMD bd1 1.36
-C AMD bobcat 1.71
-C Intel P4 2-3
-C Intel core2 1
-C Intel NHM 1
-C Intel SBR 1
-C Intel atom 2
-C VIA nano 2
+C cycles/limb
+C K8,K9: 1
+C K10: 1
+C P4: 2.8
+C P6-15: 1.2
-IFSTD(`define(`rp',`%rdi')')
-IFSTD(`define(`up',`%rsi')')
-IFSTD(`define(`n', `%rdx')')
-IFDOS(`define(`rp',`%rcx')')
-IFDOS(`define(`up',`%rdx')')
-IFDOS(`define(`n', `%r8')')
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
ASM_START()
TEXT
- ALIGN(64)
+ ALIGN(16)
PROLOGUE(mpn_copyd) C copy n limbs from up to rp, highest address first
- lea -8(up,n,8), up
- lea (rp,n,8), rp
- sub $4, n
+ leaq -8(up,n,8), up C up -> last source limb
+ leaq (rp,n,8), rp C rp -> one limb past the last destination limb
+ subq $4, n C borrow here means fewer than 4 limbs in total
jc L(end) C skip the unrolled loop
- nop
-
-L(top): mov (up), %rax
- mov -8(up), %r9
- lea -32(rp), rp
- mov -16(up), %r10
- mov -24(up), %r11
- lea -32(up), up
- mov %rax, 24(rp)
- mov %r9, 16(rp)
- sub $4, n
- mov %r10, 8(rp)
- mov %r11, (rp)
- jnc L(top)
+ ALIGN(16)
+L(oop): movq (up), %r8 C four limbs per pass: load four, step both pointers down 32 bytes, store four
+ movq -8(up), %r9
+ leaq -32(rp), rp
+ movq -16(up), %r10
+ movq -24(up), %r11
+ leaq -32(up), up
+ movq %r8, 24(rp)
+ movq %r9, 16(rp)
+ subq $4, n C sets the carry tested by jnc below
+ movq %r10, 8(rp)
+ movq %r11, (rp)
+ jnc L(oop) C the two movq in between leave the flags untouched
-L(end): shr R32(n)
+L(end): shrl %edx C edx = lowpart(n)
jnc 1f C bit 0 of n clear: no single leftover limb
- mov (up), %rax
- mov %rax, -8(rp)
- lea -8(rp), rp
- lea -8(up), up
-1: shr R32(n)
+ movq (up), %r8
+ movq %r8, -8(rp)
+ leaq -8(rp), rp
+ leaq -8(up), up
+1: shrl %edx C edx = lowpart(n)
jnc 1f C bit 1 of n clear: no leftover limb pair
- mov (up), %rax
- mov -8(up), %r9
- mov %rax, -8(rp)
- mov %r9, -16(rp)
+ movq (up), %r8
+ movq -8(up), %r9
+ movq %r8, -8(rp)
+ movq %r9, -16(rp)
1: ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/copyi.asm b/gmp/mpn/x86_64/copyi.asm
index bafce7a09e..506142be79 100644
--- a/gmp/mpn/x86_64/copyi.asm
+++ b/gmp/mpn/x86_64/copyi.asm
@@ -1,92 +1,73 @@
dnl AMD64 mpn_copyi -- copy limb vector, incrementing.
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C AMD K8,K9 1
-C AMD K10 1
-C AMD bd1 1.36
-C AMD bobcat 1.71
-C Intel P4 2-3
-C Intel core2 1
-C Intel NHM 1
-C Intel SBR 1
-C Intel atom 2
-C VIA nano 2
+C cycles/limb
+C K8,K9: 1
+C K10: 1
+C P4: 2.8
+C P6-15: 1.2
-IFSTD(`define(`rp',`%rdi')')
-IFSTD(`define(`up',`%rsi')')
-IFSTD(`define(`n', `%rdx')')
-IFDOS(`define(`rp',`%rcx')')
-IFDOS(`define(`up',`%rdx')')
-IFDOS(`define(`n', `%r8')')
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
ASM_START()
TEXT
- ALIGN(64)
- .byte 0,0,0,0,0,0
+ ALIGN(16)
PROLOGUE(mpn_copyi) C copy n limbs from up to rp, lowest address first
- lea -8(rp), rp
- sub $4, n
+ leaq -8(rp), rp C bias rp one limb low; stores below use offsets -24..0 and 8..16
+ subq $4, n C borrow here means fewer than 4 limbs in total
jc L(end) C skip the unrolled loop
+ ALIGN(16)
+L(oop): movq (up), %r8 C four limbs per pass: load four, step both pointers up 32 bytes, store four
+ movq 8(up), %r9
+ leaq 32(rp), rp
+ movq 16(up), %r10
+ movq 24(up), %r11
+ leaq 32(up), up
+ movq %r8, -24(rp)
+ movq %r9, -16(rp)
+ subq $4, n C sets the carry tested by jnc below
+ movq %r10, -8(rp)
+ movq %r11, (rp)
+ jnc L(oop) C the two movq in between leave the flags untouched
-L(top): mov (up), %rax
- mov 8(up), %r9
- lea 32(rp), rp
- mov 16(up), %r10
- mov 24(up), %r11
- lea 32(up), up
- mov %rax, -24(rp)
- mov %r9, -16(rp)
- sub $4, n
- mov %r10, -8(rp)
- mov %r11, (rp)
- jnc L(top)
-
-L(end): shr R32(n)
+L(end): shrl %edx C edx = lowpart(n)
jnc 1f C bit 0 of n clear: no single leftover limb
- mov (up), %rax
- mov %rax, 8(rp)
- lea 8(rp), rp
- lea 8(up), up
-1: shr R32(n)
+ movq (up), %r8
+ movq %r8, 8(rp)
+ leaq 8(rp), rp
+ leaq 8(up), up
+1: shrl %edx C edx = lowpart(n)
jnc 1f C bit 1 of n clear: no leftover limb pair
- mov (up), %rax
- mov 8(up), %r9
- mov %rax, 8(rp)
- mov %r9, 16(rp)
+ movq (up), %r8
+ movq 8(up), %r9
+ movq %r8, 8(rp)
+ movq %r9, 16(rp)
1: ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
deleted file mode 100644
index 7066bb4372..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-define(RSH, 63)
-
-ifdef(`OPERATION_addlsh1_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_addlsh1_n)')
-ifdef(`OPERATION_rsblsh1_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_rsblsh1_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
deleted file mode 100644
index 5065120857..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
-dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 62)
-
-ifdef(`OPERATION_addlsh2_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func, mpn_addlsh2_n)')
-ifdef(`OPERATION_rsblsh2_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func, mpn_rsblsh2_n)')
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aorrlsh_n.asm b/gmp/mpn/x86_64/core2/aorrlsh_n.asm
deleted file mode 100644
index 57abf31579..0000000000
--- a/gmp/mpn/x86_64/core2/aorrlsh_n.asm
+++ /dev/null
@@ -1,38 +0,0 @@
-dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/gmp/mpn/x86_64/core2/aors_err1_n.asm b/gmp/mpn/x86_64/core2/aors_err1_n.asm
deleted file mode 100644
index 3f875aefa4..0000000000
--- a/gmp/mpn/x86_64/core2/aors_err1_n.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n
-
-dnl Contributed by David Harvey.
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 4.14
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`ep', `%rcx')
-define(`yp', `%r8')
-define(`n', `%r9')
-define(`cy_param', `8(%rsp)')
-
-define(`el', `%rbx')
-define(`eh', `%rbp')
-define(`t0', `%r10')
-define(`t1', `%r11')
-define(`t2', `%r12')
-define(`t3', `%r13')
-define(`w0', `%r14')
-define(`w1', `%r15')
-
-ifdef(`OPERATION_add_err1_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_err1_n)')
-ifdef(`OPERATION_sub_err1_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_err1_n)')
-
-MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func) C rp[] = up[] ADCSBB vp[] over n limbs; a two-limb sum eh:el is stored at ep[0], ep[1]
- mov cy_param, %rax C fetch the carry-in from 8(%rsp) before the pushes move %rsp
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- lea (up,n,8), up C up, vp, rp -> just past their last limb
- lea (vp,n,8), vp
- lea (rp,n,8), rp
-
- mov R32(n), R32(%r10)
- and $3, R32(%r10) C n mod 4 picks the lead-in path
- jz L(0mod4)
- cmp $2, R32(%r10)
- jc L(1mod4)
- jz L(2mod4)
-L(3mod4):
- xor R32(el), R32(el) C el = eh = 0; zeroed before CF is restored below
- xor R32(eh), R32(eh)
- xor R32(t0), R32(t0)
- xor R32(t1), R32(t1)
- lea -24(yp,n,8), yp C yp is read downward: limb 0 pairs with 16(yp) = yp[n-1]
- neg n C n = -n: negative index that counts up to 0
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- mov 8(up,n,8), w1
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc 16(yp), el C take the yp limb only when this limb produced a carry
- ADCSBB 8(vp,n,8), w1
- mov w1, 8(rp,n,8)
- cmovc 8(yp), t0
- mov 16(up,n,8), w0
- ADCSBB 16(vp,n,8), w0
- mov w0, 16(rp,n,8)
- cmovc (yp), t1
- setc %al C save carry
- add t0, el C eh:el += t0
- adc $0, eh
- add t1, el C eh:el += t1
- adc $0, eh
-
- add $3, n
- jnz L(loop) C more limbs remain: continue in the 4-way loop
- jmp L(end)
-
- ALIGN(16)
-L(0mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- lea (yp,n,8), yp C yp -> just past yp[n-1]; the loop reads -8(yp) first
- neg n
- jmp L(loop)
-
- ALIGN(16)
-L(1mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- lea -8(yp,n,8), yp C (yp) = yp[n-1] pairs with limb 0
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc (yp), el
- setc %al C save carry
-
- add $1, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(16)
-L(2mod4):
- xor R32(el), R32(el)
- xor R32(eh), R32(eh)
- xor R32(t0), R32(t0)
- lea -16(yp,n,8), yp C 8(yp) = yp[n-1] pairs with limb 0
- neg n
-
- shr $1, %al C restore carry
- mov (up,n,8), w0
- mov 8(up,n,8), w1
- ADCSBB (vp,n,8), w0
- mov w0, (rp,n,8)
- cmovc 8(yp), el
- ADCSBB 8(vp,n,8), w1
- mov w1, 8(rp,n,8)
- cmovc (yp), t0
- setc %al C save carry
- add t0, el
- adc $0, eh
-
- add $2, n
- jnz L(loop)
- jmp L(end)
-
- ALIGN(32)
-L(loop):
- mov (up,n,8), w0
- shr $1, %al C restore carry
- mov -8(yp), t0
- mov $0, R32(t3) C mov rather than xor: CF restored by the shr above must stay intact
- ADCSBB (vp,n,8), w0
- cmovnc t3, t0 C no carry from this limb: contribute 0 instead of the yp limb
- mov w0, (rp,n,8)
- mov 8(up,n,8), w1
- mov 16(up,n,8), w0
- ADCSBB 8(vp,n,8), w1
- mov -16(yp), t1
- cmovnc t3, t1
- mov -24(yp), t2
- mov w1, 8(rp,n,8)
- ADCSBB 16(vp,n,8), w0
- cmovnc t3, t2
- mov 24(up,n,8), w1
- ADCSBB 24(vp,n,8), w1
- cmovc -32(yp), t3 C t3 is still 0 here unless this limb carried
- setc %al C save carry
- add t0, el C accumulate the four selected yp limbs into eh:el
- adc $0, eh
- add t1, el
- adc $0, eh
- add t2, el
- adc $0, eh
- lea -32(yp), yp C step yp down four limbs
- mov w0, 16(rp,n,8)
- add t3, el
- adc $0, eh
- add $4, n
- mov w1, -8(rp,n,8) C n already advanced by 4, so -8 addresses the fourth limb of this pass
- jnz L(loop) C ZF comes from the add above; the mov does not touch flags
-
-L(end):
- mov el, (ep) C ep[0] = low limb of the accumulated sum
- mov eh, 8(ep) C ep[1] = high limb
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- ret C %al still holds the carry saved by the last setc
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/aors_n.asm b/gmp/mpn/x86_64/core2/aors_n.asm
index 74a1bce48a..d26af866f9 100644
--- a/gmp/mpn/x86_64/core2/aors_n.asm
+++ b/gmp/mpn/x86_64/core2/aors_n.asm
@@ -1,45 +1,30 @@
-dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
+dnl Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract.
-dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
+dnl Copyright 2006, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2
-C AMD K10 2
-C Intel P4 10
-C Intel core2 2
-C Intel NHM 2
-C Intel SBR 2
-C Intel atom 9
-C VIA nano 3
+C K8,K9: 2.25
+C K10: 2
+C P4: 10
+C P6-15: 2.05
C INPUT PARAMETERS
define(`rp', `%rdi')
@@ -59,83 +44,80 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
+
TEXT
ALIGN(16)
+
+PROLOGUE(func_nc) C carry-in entry point: joins func at L(start), skipping the clearing of %r8
+ jmp L(start)
+EPILOGUE()
+
PROLOGUE(func)
- FUNC_ENTRY(4)
xor %r8, %r8 C plain entry: carry-in = 0
L(start):
mov (up), %r10 C preload limb 0 of both operands
mov (vp), %r11
- lea (up,n,8), up
- lea (vp,n,8), vp
- lea (rp,n,8), rp
- mov R32(n), R32(%rax)
+ lea -8(up,n,8), up C up, vp -> last limb; rp -> one limb below its last limb
+ lea -8(vp,n,8), vp
+ lea -16(rp,n,8), rp
+ mov %ecx, %eax C eax = low 32 bits of %rcx, taken before the neg below
neg n
- and $3, R32(%rax)
+ and $3, %eax C count mod 4 selects the loop entry point
je L(b00)
- add %rax, n C clear low rcx bits for jrcxz
- cmp $2, R32(%rax)
+ add %rax, n C clear low rcx bits for jrcxz
+ cmp $2, %eax
jl L(b01)
je L(b10)
-L(b11): neg %r8 C set cy
+L(b11): shr %r8 C set cy
jmp L(e11)
-L(b00): neg %r8 C set cy
+L(b00): shr %r8 C set cy
mov %r10, %r8 C move the preloaded limbs to the register pair that e00 consumes
mov %r11, %r9
lea 4(n), n C lea, so CF set by the shr above is preserved
jmp L(e00)
- nop
- nop
- nop
-L(b01): neg %r8 C set cy
- jmp L(top)
+L(b01): shr %r8 C set cy
+ jmp L(e01)
-L(b10): neg %r8 C set cy
+L(b10): shr %r8 C set cy
mov %r10, %r8
mov %r11, %r9
jmp L(e10)
L(end): ADCSBB %r11, %r10 C last limb pair is still pending in r10/r11
- mov %r10, -8(rp)
- mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0
- adc R32(%rax), R32(%rax)
- FUNC_EXIT()
+ mov %r10, 8(rp)
+ mov %ecx, %eax C clear eax, ecx contains 0
+ adc %eax, %eax C eax = 0 + 0 + CF: return the final carry/borrow
ret
ALIGN(16)
-L(top): jrcxz L(end)
- mov (up,n,8), %r8
- mov (vp,n,8), %r9
- lea 4(n), n
- ADCSBB %r11, %r10
- mov %r10, -40(rp,n,8)
-L(e00): mov -24(up,n,8), %r10
- mov -24(vp,n,8), %r11
- ADCSBB %r9, %r8
- mov %r8, -32(rp,n,8)
-L(e11): mov -16(up,n,8), %r8
- mov -16(vp,n,8), %r9
+L(top): C each step loads the next limb pair, then finishes (ADCSBB + store) the previous one
+ mov -24(up,n,8), %r8
+ mov -24(vp,n,8), %r9
ADCSBB %r11, %r10
mov %r10, -24(rp,n,8)
-L(e10): mov -8(up,n,8), %r10
- mov -8(vp,n,8), %r11
+L(e00):
+ mov -16(up,n,8), %r10
+ mov -16(vp,n,8), %r11
ADCSBB %r9, %r8
mov %r8, -16(rp,n,8)
+L(e11):
+ mov -8(up,n,8), %r8
+ mov -8(vp,n,8), %r9
+ ADCSBB %r11, %r10
+ mov %r10, -8(rp,n,8)
+L(e10):
+ mov (up,n,8), %r10
+ mov (vp,n,8), %r11
+ ADCSBB %r9, %r8
+ mov %r8, (rp,n,8)
+L(e01):
+ jrcxz L(end) C leave when %rcx is 0; jrcxz reads no flags, so the carry chain is intact
+ lea 4(n), n C advance the index with lea so CF survives into the next pass
jmp L(top)
-EPILOGUE()
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- jmp L(start)
EPILOGUE()
-
diff --git a/gmp/mpn/x86_64/core2/sublshC_n.asm b/gmp/mpn/x86_64/core2/aorslsh1_n.asm
index 5acc46b032..18db7c96f8 100644
--- a/gmp/mpn/x86_64/core2/sublshC_n.asm
+++ b/gmp/mpn/x86_64/core2/aorslsh1_n.asm
@@ -1,45 +1,29 @@
-dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << 1), optimised for Core 2 and
-dnl Core iN.
+dnl x86-64 mpn_addlsh1_n and mpn_sublsh1_n, optimized for "Core" 2.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4.25
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 3
-C Intel NHM 3.1
-C Intel SBR 2.47
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 4.25
+C K10: ?
+C P4: ?
+C P6-15: 3
C INPUT PARAMETERS
define(`rp',`%rdi')
@@ -47,11 +31,21 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh1_n)')
+ifdef(`OPERATION_sublsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_sublsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
ASM_START()
TEXT
ALIGN(8)
PROLOGUE(func)
- FUNC_ENTRY(4)
push %rbx
push %r12
@@ -64,7 +58,7 @@ PROLOGUE(func)
xor R32(%r11), R32(%r11)
mov -24(vp,n,8), %r8 C do first limb early
- shrd $RSH, %r8, %r11
+ shrd $63, %r8, %r11
and $3, R32(%rax)
je L(b0)
@@ -73,9 +67,9 @@ PROLOGUE(func)
je L(b2)
L(b3): mov -16(vp,n,8), %r9
- shrd $RSH, %r9, %r8
+ shrd $63, %r9, %r8
mov -8(vp,n,8), %r10
- shrd $RSH, %r10, %r9
+ shrd $63, %r10, %r9
mov -24(up,n,8), %r12
ADDSUB %r11, %r12
mov %r12, -24(rp,n,8)
@@ -101,7 +95,7 @@ L(b1): mov -24(up,n,8), %r12
jmp L(end)
L(b2): mov -16(vp,n,8), %r9
- shrd $RSH, %r9, %r8
+ shrd $63, %r9, %r8
mov -24(up,n,8), %r12
ADDSUB %r11, %r12
mov %r12, -24(rp,n,8)
@@ -116,13 +110,13 @@ L(b2): mov -16(vp,n,8), %r9
ALIGN(16)
L(top): mov -24(vp,n,8), %r8
- shrd $RSH, %r8, %r11
+ shrd $63, %r8, %r11
L(b0): mov -16(vp,n,8), %r9
- shrd $RSH, %r9, %r8
+ shrd $63, %r9, %r8
mov -8(vp,n,8), %r10
- shrd $RSH, %r10, %r9
+ shrd $63, %r10, %r9
mov (vp,n,8), %rbx
- shrd $RSH, %rbx, %r10
+ shrd $63, %rbx, %r10
add R32(%rax), R32(%rax) C restore cy
@@ -148,11 +142,10 @@ L(b0): mov -16(vp,n,8), %r9
add $4, n
js L(top)
-L(end): shr $RSH, %r11
+L(end): add %r11, %r11
pop %r12
pop %rbx
- sub R32(%r11), R32(%rax)
+ sbb $0, R32(%rax)
neg R32(%rax)
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/aorsmul_1.asm b/gmp/mpn/x86_64/core2/aorsmul_1.asm
index 6b313dd836..1d05b30b59 100644
--- a/gmp/mpn/x86_64/core2/aorsmul_1.asm
+++ b/gmp/mpn/x86_64/core2/aorsmul_1.asm
@@ -1,46 +1,29 @@
dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
-dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4
-C AMD K10 4
-C AMD bd1 5.1
-C AMD bobcat
-C Intel P4 ?
-C Intel core2 4.3-4.5 (fluctuating)
-C Intel NHM 5.0
-C Intel SBR 4.1
-C Intel atom ?
-C VIA nano 5.25
+C K8,K9: 4
+C K10: 4
+C P4: ?
+C P6-15: 4.3-4.7 (fluctuating)
C INPUT PARAMETERS
define(`rp', `%rdi')
@@ -50,129 +33,111 @@ define(`v0', `%rcx')
ifdef(`OPERATION_addmul_1',`
define(`ADDSUB', `add')
- define(`func', `mpn_addmul_1')
- define(`func_1c', `mpn_addmul_1c')
+ define(`func', `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
define(`ADDSUB', `sub')
- define(`func', `mpn_submul_1')
- define(`func_1c', `mpn_submul_1c')
+ define(`func', `mpn_submul_1')
')
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
- C For DOS, on the stack we have four saved registers, return address,
- C space for four register arguments, and finally the carry input.
-
-IFDOS(` define(`carry_in', `72(%rsp)')') dnl
-IFSTD(` define(`carry_in', `%r8')') dnl
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
ASM_START()
TEXT
ALIGN(16)
-PROLOGUE(func_1c)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- lea (%rdx), %rbx
- neg %rbx
-
- mov (up), %rax
- mov (rp), %r10
-
- lea -16(rp,%rdx,8), rp
- lea (up,%rdx,8), up
- mul %rcx
- add carry_in, %rax
- adc $0, %rdx
- jmp L(start_nc)
-EPILOGUE()
-
- ALIGN(16)
PROLOGUE(func)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- lea (%rdx), %rbx
- neg %rbx
+ push %r15
+ push %r12
+ push %r13
+ lea (%rdx), %r15
+ neg %r15
mov (up), %rax
- mov (rp), %r10
- lea -16(rp,%rdx,8), rp
+ bt $0, %r15
+ jc L(odd)
+
+ lea (rp,%rdx,8), rp
lea (up,%rdx,8), up
mul %rcx
-L(start_nc):
- bt $0, R32(%rbx)
- jc L(odd)
-
lea (%rax), %r11
- mov 8(up,%rbx,8), %rax
- lea (%rdx), %rbp
- mul %rcx
- add $2, %rbx
+ mov 8(up,%r15,8), %rax
+ mov (rp,%r15,8), %r13
+ lea (%rdx), %r12
+
+ add $2, %r15
jns L(n2)
+ mul %rcx
lea (%rax), %r8
- mov (up,%rbx,8), %rax
+ mov (up,%r15,8), %rax
+ mov -8(rp,%r15,8), %r10
lea (%rdx), %r9
- jmp L(mid)
+ jmp L(m)
-L(odd): add $1, %rbx
+L(odd): lea (rp,%rdx,8), rp
+ lea (up,%rdx,8), up
+ mul %rcx
+ add $1, %r15
jns L(n1)
- lea (%rax), %r8
- mov (up,%rbx,8), %rax
+L(gt1): lea (%rax), %r8
+ mov (up,%r15,8), %rax
+ mov -8(rp,%r15,8), %r10
lea (%rdx), %r9
mul %rcx
lea (%rax), %r11
- mov 8(up,%rbx,8), %rax
- lea (%rdx), %rbp
- jmp L(e)
+ mov 8(up,%r15,8), %rax
+ mov (rp,%r15,8), %r13
+ lea (%rdx), %r12
+ add $2, %r15
+ jns L(end)
ALIGN(16)
L(top): mul %rcx
ADDSUB %r8, %r10
lea (%rax), %r8
- mov (up,%rbx,8), %rax
+ mov 0(up,%r15,8), %rax
adc %r9, %r11
- mov %r10, -8(rp,%rbx,8)
- mov (rp,%rbx,8), %r10
+ mov %r10, -24(rp,%r15,8)
+ mov -8(rp,%r15,8), %r10
lea (%rdx), %r9
- adc $0, %rbp
-L(mid): mul %rcx
- ADDSUB %r11, %r10
+ adc $0, %r12
+L(m): mul %rcx
+ ADDSUB %r11, %r13
lea (%rax), %r11
- mov 8(up,%rbx,8), %rax
- adc %rbp, %r8
- mov %r10, (rp,%rbx,8)
- mov 8(rp,%rbx,8), %r10
- lea (%rdx), %rbp
+ mov 8(up,%r15,8), %rax
+ adc %r12, %r8
+ mov %r13, -16(rp,%r15,8)
+ mov 0(rp,%r15,8), %r13
+ lea (%rdx), %r12
adc $0, %r9
-L(e): add $2, %rbx
+
+ add $2, %r15
js L(top)
- mul %rcx
+L(end): mul %rcx
ADDSUB %r8, %r10
adc %r9, %r11
- mov %r10, -8(rp)
- adc $0, %rbp
-L(n2): mov (rp), %r10
- ADDSUB %r11, %r10
- adc %rbp, %rax
- mov %r10, (rp)
+ mov %r10, -24(rp,%r15,8)
+ mov -8(rp,%r15,8), %r10
+ adc $0, %r12
+L(r): ADDSUB %r11, %r13
+ adc %r12, %rax
+ mov %r13, -16(rp,%r15,8)
adc $0, %rdx
-L(n1): mov 8(rp), %r10
- ADDSUB %rax, %r10
- mov %r10, 8(rp)
- mov R32(%rbx), R32(%rax) C zero rax
+L(x): ADDSUB %rax, %r10
+ mov %r10, -8(rp,%r15,8)
+ mov $0, %eax
adc %rdx, %rax
- pop %rbp
- pop %rbx
- FUNC_EXIT()
+L(ret): pop %r13
+ pop %r12
+ pop %r15
ret
+L(n2): mul %rcx
+ mov -8(rp,%r15,8), %r10
+ jmp L(r)
+L(n1): mov -8(rp,%r15,8), %r10
+ jmp L(x)
EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/copyd.asm b/gmp/mpn/x86_64/core2/copyd.asm
deleted file mode 100644
index f0dc54a55e..0000000000
--- a/gmp/mpn/x86_64/core2/copyd.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyd optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyd)
-include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp/mpn/x86_64/core2/copyi.asm b/gmp/mpn/x86_64/core2/copyi.asm
deleted file mode 100644
index 9c26e00c52..0000000000
--- a/gmp/mpn/x86_64/core2/copyi.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyi optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyi)
-include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp/mpn/x86_64/core2/divrem_1.asm b/gmp/mpn/x86_64/core2/divrem_1.asm
deleted file mode 100644
index 623bea386c..0000000000
--- a/gmp/mpn/x86_64/core2/divrem_1.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-dnl x86-64 mpn_divrem_1 -- mpn by limb division.
-
-dnl Copyright 2004, 2005, 2007-2010, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C norm unorm frac
-C AMD K8,K9 15 15 12
-C AMD K10 15 15 12
-C Intel P4 44 44 43
-C Intel core2 24 24 19.5
-C Intel corei 19 19 18
-C Intel atom 51 51 36
-C VIA nano 46 44 22.5
-
-C mp_limb_t
-C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
-C mp_srcptr np, mp_size_t nn, mp_limb_t d)
-
-C mp_limb_t
-C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
-C mp_srcptr np, mp_size_t nn, mp_limb_t d,
-C mp_limb_t dinv, int cnt)
-
-C INPUT PARAMETERS
-define(`qp', `%rdi')
-define(`fn_param', `%rsi')
-define(`up_param', `%rdx')
-define(`un_param', `%rcx')
-define(`d', `%r8')
-define(`dinv', `%r9') C only for mpn_preinv_divrem_1
-C shift passed on stack C only for mpn_preinv_divrem_1
-
-define(`cnt', `%rcx')
-define(`up', `%rsi')
-define(`fn', `%r12')
-define(`un', `%rbx')
-
-
-C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C cnt qp d dinv
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFSTD(`define(`CNTOFF', `40($1)')')
-IFDOS(`define(`CNTOFF', `104($1)')')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_preinv_divrem_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
- xor R32(%rax), R32(%rax)
- push %r13
- push %r12
- push %rbp
- push %rbx
-
- mov fn_param, fn
- mov un_param, un
- add fn_param, un_param
- mov up_param, up
-
- lea -8(qp,un_param,8), qp
-
- mov CNTOFF(%rsp), R8(cnt)
- shl R8(cnt), d
- jmp L(ent)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_divrem_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- xor R32(%rax), R32(%rax)
- push %r13
- push %r12
- push %rbp
- push %rbx
-
- mov fn_param, fn
- mov un_param, un
- add fn_param, un_param
- mov up_param, up
- je L(ret)
-
- lea -8(qp,un_param,8), qp
- xor R32(%rbp), R32(%rbp)
-
-L(unnormalized):
- test un, un
- je L(44)
- mov -8(up,un,8), %rax
- cmp d, %rax
- jae L(44)
- mov %rbp, (qp)
- mov %rax, %rbp
- lea -8(qp), qp
- je L(ret)
- dec un
-L(44):
- bsr d, %rcx
- not R32(%rcx)
- sal R8(%rcx), d
- sal R8(%rcx), %rbp
-
- push %rcx
-IFSTD(` push %rdi ')
-IFSTD(` push %rsi ')
- push %r8
-IFSTD(` mov d, %rdi ')
-IFDOS(` mov d, %rcx ')
- CALL( mpn_invert_limb)
- pop %r8
-IFSTD(` pop %rsi ')
-IFSTD(` pop %rdi ')
- pop %rcx
-
- mov %rax, dinv
- mov %rbp, %rax
- test un, un
- je L(frac)
-L(ent): mov -8(up,un,8), %rbp
- shr R8(%rcx), %rax
- shld R8(%rcx), %rbp, %rax
- sub $2, un
- js L(end)
-
- ALIGN(16)
-L(top): lea 1(%rax), %r11
- mul dinv
- mov (up,un,8), %r10
- shld R8(%rcx), %r10, %rbp
- mov %rbp, %r13
- add %rax, %r13
- adc %r11, %rdx
- mov %rdx, %r11
- imul d, %rdx
- sub %rdx, %rbp
- lea (d,%rbp), %rax
- sub $8, qp
- cmp %r13, %rbp
- cmovc %rbp, %rax
- adc $-1, %r11
- cmp d, %rax
- jae L(ufx)
-L(uok): dec un
- mov %r11, 8(qp)
- mov %r10, %rbp
- jns L(top)
-
-L(end): lea 1(%rax), %r11
- sal R8(%rcx), %rbp
- mul dinv
- add %rbp, %rax
- adc %r11, %rdx
- mov %rax, %r11
- mov %rdx, %r13
- imul d, %rdx
- sub %rdx, %rbp
- mov d, %rax
- add %rbp, %rax
- cmp %r11, %rbp
- cmovc %rbp, %rax
- adc $-1, %r13
- cmp d, %rax
- jae L(efx)
-L(eok): mov %r13, (qp)
- sub $8, qp
- jmp L(frac)
-
-L(ufx): sub d, %rax
- inc %r11
- jmp L(uok)
-L(efx): sub d, %rax
- inc %r13
- jmp L(eok)
-
-L(frac):mov d, %rbp
- neg %rbp
- jmp L(fent)
-
- ALIGN(16) C K8-K10 P6-CNR P6-NHM P4
-L(ftop):mul dinv C 0,12 0,17 0,17
- add %r11, %rdx C 5 8 10
- mov %rax, %r11 C 4 8 3
- mov %rdx, %r13 C 6 9 11
- imul %rbp, %rdx C 6 9 11
- mov d, %rax C
- add %rdx, %rax C 10 14 14
- cmp %r11, %rdx C 10 14 14
- cmovc %rdx, %rax C 11 15 15
- adc $-1, %r13 C
- mov %r13, (qp) C
- sub $8, qp C
-L(fent):lea 1(%rax), %r11 C
- dec fn C
- jns L(ftop) C
-
- shr R8(%rcx), %rax
-L(ret): pop %rbx
- pop %rbp
- pop %r12
- pop %r13
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/gcd_1.asm b/gmp/mpn/x86_64/core2/gcd_1.asm
deleted file mode 100644
index e0cab9b4e4..0000000000
--- a/gmp/mpn/x86_64/core2/gcd_1.asm
+++ /dev/null
@@ -1,144 +0,0 @@
-dnl AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K8,K9 8.50
-C AMD K10 4.30
-C AMD bd1 5.00
-C AMD bobcat 10.0
-C Intel P4 18.6
-C Intel core2 3.83
-C Intel NHM 5.17
-C Intel SBR 4.69
-C Intel atom 17.0
-C VIA nano 5.44
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C TODO
-C * Optimise inner-loop for specific CPUs.
-C * Use DIV for 1-by-1 reductions, at least for some CPUs.
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up', `%rdi')
-define(`n', `%rsi')
-define(`v0', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-C Undo some configure cleverness.
-C The problem is that C only defines the '1c' variant, and that configure
-C therefore considers modexact_1c to be the base function. It then adds a
-C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
-C gcd_1 exists without a corresponding cpudep mode1o.
-ifdef(`WANT_FAT_BINARY', `
- define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
-
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- FUNC_ENTRY(3)
- mov (up), %rax C U low limb
- or v0, %rax
- bsf %rax, %rax C min(ctz(u0),ctz(v0))
-
- bsf v0, %rcx
- shr R8(%rcx), v0
-
- push %rax C preserve common twos over call
- push v0 C preserve v0 argument over call
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %r8
- mov %r8, %rax
- shr $BMOD_THRES_LOG2, %r8
- cmp %r8, v0
- ja L(reduced)
- jmp L(bmod)
-
-L(reduce_nby1):
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- CALL( mpn_mod_1)
- jmp L(reduced)
-L(bmod):
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- CALL( mpn_modexact_1_odd)
-L(reduced):
-
- add $STACK_ALLOC, %rsp
- pop %rdx
-
- bsf %rax, %rcx
-C test %rax, %rax C FIXME: does this lower latency?
- jnz L(mid)
- jmp L(end)
-
- ALIGN(16) C K10 BD C2 NHM SBR
-L(top): cmovc %r10, %rax C if x-y < 0 0,3 0,3 0,6 0,5 0,5
- cmovc %r9, %rdx C use x,y-x 0,3 0,3 2,8 1,7 1,7
-L(mid): shr R8(%rcx), %rax C 1,7 1,6 2,8 2,8 2,8
- mov %rdx, %r10 C 1 1 4 3 3
- sub %rax, %r10 C 2 2 5 4 4
- bsf %r10, %rcx C 3 3 6 5 5
- mov %rax, %r9 C 2 2 3 3 4
- sub %rdx, %rax C 2 2 4 3 4
- jnz L(top) C
-
-L(end): pop %rcx
- mov %rdx, %rax
- shl R8(%rcx), %rax
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/gmp-mparam.h b/gmp/mpn/x86_64/core2/gmp-mparam.h
index 0f4f88f780..8207da4895 100644
--- a/gmp/mpn/x86_64/core2/gmp-mparam.h
+++ b/gmp/mpn/x86_64/core2/gmp-mparam.h
@@ -1,217 +1,78 @@
-/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file.
+/* "Core 2" gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
-Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
-or both in parallel, as here.
+/* 2133 MHz "Core 2" / 65nm / 4096 Kibyte cache / socket 775 */
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 2133 MHz Core 2 (65nm) */
-/* FFT tuning limit = 60000000 */
-/* Generated by tuneup.c, 2014-03-13, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 24
-
-#define MUL_TOOM22_THRESHOLD 23
-#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 179
-#define MUL_TOOM6H_THRESHOLD 268
-#define MUL_TOOM8H_THRESHOLD 357
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 102
-#define SQR_TOOM4_THRESHOLD 160
-#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 296
-
-#define MULMID_TOOM42_THRESHOLD 28
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 13
-
-#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
- { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 32, 8}, \
- { 17, 7}, { 36, 8}, { 19, 7}, { 40, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 43,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 63,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 135, 9}, { 271,11}, \
- { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \
- { 95,10}, { 191, 9}, { 383,10}, { 207,11}, \
- { 111,12}, { 63,11}, { 127,10}, { 271,11}, \
- { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
- { 383,11}, { 207,10}, { 415,11}, { 223,13}, \
- { 63,12}, { 127,11}, { 271,10}, { 543,11}, \
- { 287,10}, { 575,11}, { 303,10}, { 607,12}, \
- { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
- { 191,11}, { 415,12}, { 223,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
- { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
- { 703,13}, { 191,12}, { 415,11}, { 831,12}, \
- { 479,14}, { 127,13}, { 255,12}, { 607,13}, \
- { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
- { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
- { 1023,13}, { 575,12}, { 1215,13}, { 639,12}, \
- { 1279,13}, { 703,14}, { 383,13}, { 831,12}, \
- { 1663,13}, { 895,15}, { 255,14}, { 511,13}, \
- { 1151,14}, { 639,13}, { 1343,14}, { 767,13}, \
- { 1599,14}, { 895,15}, { 511,14}, { 1279,13}, \
- { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \
- { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \
- { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \
- { 2175,12}, { 8959,14}, { 2303,13}, { 4607,12}, \
- { 9471,14}, { 2431,13}, { 4863,12}, { 9983,15}, \
- { 1279,14}, { 2559,12}, { 10239,14}, { 2687,12}, \
- { 11775,15}, { 1535,14}, { 3327,13}, { 6655,14}, \
- { 3455,13}, { 6911,14}, { 3583,12}, { 14335,11}, \
- { 28671,10}, { 57343,11}, { 2048,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 183
-#define MUL_FFT_THRESHOLD 4736
-
-#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 340, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
- { 9, 5}, { 19, 6}, { 23, 7}, { 12, 6}, \
- { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 43,10}, \
- { 23, 9}, { 55,10}, { 31, 9}, { 67,10}, \
- { 39, 9}, { 79,10}, { 47,11}, { 31,10}, \
- { 79,11}, { 47,10}, { 95,12}, { 31,11}, \
- { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \
- { 271, 8}, { 543,11}, { 79, 9}, { 319, 8}, \
- { 639,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \
- { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \
- { 303, 9}, { 607,10}, { 319, 9}, { 639,11}, \
- { 175,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,10}, { 415,13}, { 63,12}, { 127,11}, \
- { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
- { 303,10}, { 607,11}, { 319,10}, { 639,11}, \
- { 351,12}, { 191,11}, { 415,10}, { 831,12}, \
- { 223,11}, { 447,10}, { 895,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
- { 607,12}, { 319,11}, { 639,12}, { 351,13}, \
- { 191,12}, { 415,11}, { 831,12}, { 479,14}, \
- { 127,13}, { 255,12}, { 607,13}, { 319,12}, \
- { 703,13}, { 383,12}, { 831,13}, { 447,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1023,13}, \
- { 575,12}, { 1215,13}, { 639,12}, { 1279,13}, \
- { 703,14}, { 383,13}, { 831,12}, { 1663,13}, \
- { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1215,14}, { 639,13}, { 1343,12}, \
- { 2687,13}, { 1407,12}, { 2815,14}, { 767,13}, \
- { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
- { 1407,15}, { 767,14}, { 1535,13}, { 3071,14}, \
- { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
- { 3583,16}, { 511,15}, { 1023,14}, { 2175,13}, \
- { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \
- { 4863,15}, { 1279,14}, { 2815,13}, { 5631,14}, \
- { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \
- { 6911,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 179
-#define SQR_FFT_THRESHOLD 3008
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 62
-#define MULLO_MUL_N_THRESHOLD 9174
-
-#define DC_DIV_QR_THRESHOLD 46
-#define DC_DIVAPPR_Q_THRESHOLD 155
-#define DC_BDIV_QR_THRESHOLD 50
-#define DC_BDIV_Q_THRESHOLD 94
-
-#define INV_MULMOD_BNM1_THRESHOLD 48
-#define INV_NEWTON_THRESHOLD 156
-#define INV_APPR_THRESHOLD 155
-
-#define BINV_NEWTON_THRESHOLD 234
-#define REDC_1_TO_REDC_2_THRESHOLD 22
-#define REDC_2_TO_REDC_N_THRESHOLD 48
-
-#define MU_DIV_QR_THRESHOLD 1187
-#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 74
-#define MU_BDIV_QR_THRESHOLD 1017
-#define MU_BDIV_Q_THRESHOLD 1187
-
-#define POWM_SEC_TABLE 1,64,131,269,466
-
-#define MATRIX22_STRASSEN_THRESHOLD 19
-#define HGCD_THRESHOLD 117
-#define HGCD_APPR_THRESHOLD 151
-#define HGCD_REDUCE_THRESHOLD 2121
-#define GCD_DC_THRESHOLD 427
-#define GCDEXT_DC_THRESHOLD 342
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 11
-#define GET_STR_PRECOMPUTE_THRESHOLD 18
-#define SET_STR_DC_THRESHOLD 552
-#define SET_STR_PRECOMPUTE_THRESHOLD 1561
-
-#define FAC_DSC_THRESHOLD 656
-#define FAC_ODD_THRESHOLD 23
+/* Generated by tuneup.c, 2009-01-14, gcc 4.2 */
+
+#define MUL_KARATSUBA_THRESHOLD 18
+#define MUL_TOOM3_THRESHOLD 65
+#define MUL_TOOM44_THRESHOLD 166
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 163
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 20
+#define MULLOW_MUL_N_THRESHOLD 232
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 60
+#define POWM_THRESHOLD 77
+
+#define MATRIX22_STRASSEN_THRESHOLD 25
+#define HGCD_THRESHOLD 140
+#define GCD_DC_THRESHOLD 691
+#define GCDEXT_DC_THRESHOLD 760
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 3
+#define MOD_1_2_THRESHOLD 5
+#define MOD_1_4_THRESHOLD 20
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 16
+#define SET_STR_DC_THRESHOLD 668
+#define SET_STR_PRECOMPUTE_THRESHOLD 2052
+
+#define MUL_FFT_TABLE { 336, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 }
+#define MUL_FFT_MODF_THRESHOLD 352
+#define MUL_FFT_THRESHOLD 3456
+
+#define SQR_FFT_TABLE { 336, 736, 1728, 3328, 7168, 20480, 81920, 327680, 0 }
+#define SQR_FFT_MODF_THRESHOLD 352
+#define SQR_FFT_THRESHOLD 2432
+
+/* Generated 2009-01-12, gcc 4.2 */
+
+#define MUL_FFT_TABLE2 {{1,4}, {273,5}, {545,6}, {1217,7}, {3201,8}, {6913,9}, {7681,8}, {8449,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {39937,9}, {42497,10}, {56321,11}, {63489,10}, {81409,11}, {92161,10}, {93185,11}, {96257,12}, {126977,11}, {131073,10}, {138241,11}, {167937,10}, {169473,11}, {169985,10}, {172033,11}, {195585,9}, {196097,11}, {198657,10}, {208897,11}, {217089,12}, {258049,11}, {261121,9}, {262657,10}, {275457,11}, {302081,10}, {307201,11}, {331777,12}, {389121,11}, {425985,13}, {516097,12}, {520193,11}, {598017,12}, {610305,11}, {614401,12}, {651265,11}, {653313,10}, {654337,11}, {673793,10}, {674817,11}, {677889,10}, {679937,11}, {718849,10}, {719873,12}, {782337,11}, {850945,12}, {913409,11}, {925697,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1244161,12}, {1306625,11}, {1310721,12}, {1327105,11}, {1347585,12}, {1355777,11}, {1366017,12}, {1439745,13}, {1564673,12}, {1835009,14}, {1900545,12}, {1904641,14}, {2080769,13}, {2088961,12}, {2488321,13}, {2613249,12}, {2879489,13}, {2932737,12}, {2940929,13}, {3137537,12}, {3403777,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4978689,13}, {5234689,12}, {5500929,13}, {5758977,14}, {6275073,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9957377,14}, {MP_SIZE_T_MAX, 0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {241,5}, {545,6}, {1345,7}, {3201,8}, {6913,9}, {7681,8}, {8961,9}, {9729,8}, {10497,9}, {13825,10}, {15361,9}, {19969,10}, {23553,9}, {28161,11}, {30721,10}, {31745,9}, {34305,10}, {55297,11}, {63489,10}, {80897,11}, {94209,10}, {97281,12}, {126977,11}, {129025,9}, {130049,10}, {138753,11}, {162817,9}, {164353,11}, {170497,10}, {178177,11}, {183297,10}, {184321,11}, {194561,10}, {208897,12}, {219137,11}, {221185,12}, {258049,11}, {261121,9}, {261633,10}, {267777,9}, {268289,11}, {270337,10}, {274945,9}, {276481,10}, {278529,11}, {292865,9}, {293377,10}, {295937,9}, {296449,10}, {306177,9}, {309249,10}, {310273,11}, {328705,12}, {331777,11}, {335873,12}, {344065,11}, {346113,12}, {352257,11}, {356353,12}, {389121,11}, {395265,10}, {398337,11}, {419841,10}, {421889,11}, {423937,13}, {516097,12}, {520193,11}, {546817,10}, {550913,11}, {561153,10}, {563201,11}, {579585,10}, {585729,11}, {621569,12}, {636929,11}, {638977,12}, {651265,11}, {714753,10}, {716801,11}, {718849,12}, {782337,11}, {849921,12}, {913409,11}, {954369,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,12}, {1437697,13}, {1564673,12}, {1961985,14}, {2080769,13}, {2088961,12}, {2486273,13}, {2613249,12}, {2879489,13}, {3137537,12}, {3272705,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4976641,13}, {5234689,12}, {5320705,13}, {5324801,12}, {5447681,13}, {5455873,12}, {5500929,13}, {5758977,14}, {6275073,13}, {6283265,12}, {6549505,13}, {7856129,15}, {8355841,14}, {8372225,13}, {9953281,14}, {MP_SIZE_T_MAX, 0}}
diff --git a/gmp/mpn/x86_64/core2/lshift.asm b/gmp/mpn/x86_64/core2/lshift.asm
index 8ccafeca6c..60518901eb 100644
--- a/gmp/mpn/x86_64/core2/lshift.asm
+++ b/gmp/mpn/x86_64/core2/lshift.asm
@@ -1,83 +1,64 @@
dnl x86-64 mpn_lshift optimized for "Core 2".
-dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2007 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4.25
-C AMD K10 4.25
-C Intel P4 14.7
-C Intel core2 1.27
-C Intel NHM 1.375 (up to about n = 260, then 1.5)
-C Intel SBR 1.87
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 4.25
+C K10: 4.25
+C P4: 14.7
+C P6-15: 1.27
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`cnt', `%cl')
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshift)
- FUNC_ENTRY(4)
lea -8(rp,n,8), rp
lea -8(up,n,8), up
- mov R32(%rdx), R32(%rax)
- and $3, R32(%rax)
+ mov %edx, %eax
+ and $3, %eax
jne L(nb00)
L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov -8(up), %r11
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r10, %rax
+ xor %eax, %eax
+ shld %cl, %r10, %rax
mov -16(up), %r8
lea 24(rp), rp
sub $4, n
jmp L(00)
L(nb00):C n = 1, 5, 9, ...
- cmp $2, R32(%rax)
+ cmp $2, %eax
jae L(nb01)
L(b01): mov (up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r9, %rax
+ xor %eax, %eax
+ shld %cl, %r9, %rax
sub $2, n
jb L(le1)
mov -8(up), %r10
@@ -85,65 +66,62 @@ L(b01): mov (up), %r9
lea -8(up), up
lea 16(rp), rp
jmp L(01)
-L(le1): shl R8(cnt), %r9
+L(le1): shl %cl, %r9
mov %r9, (rp)
- FUNC_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
jne L(b11)
L(b10): mov (up), %r8
mov -8(up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r8, %rax
+ xor %eax, %eax
+ shld %cl, %r8, %rax
sub $3, n
jb L(le2)
mov -16(up), %r10
lea -16(up), up
lea 8(rp), rp
jmp L(10)
-L(le2): shld R8(cnt), %r9, %r8
+L(le2): shld %cl, %r9, %r8
mov %r8, (rp)
- shl R8(cnt), %r9
+ shl %cl, %r9
mov %r9, -8(rp)
- FUNC_EXIT()
ret
ALIGN(16) C performance critical!
L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov -8(up), %r8
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r11, %rax
+ xor %eax, %eax
+ shld %cl, %r11, %rax
mov -16(up), %r9
lea -24(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shld R8(cnt), %r8, %r11
+L(top): shld %cl, %r8, %r11
mov (up), %r10
mov %r11, (rp)
-L(10): shld R8(cnt), %r9, %r8
+L(10): shld %cl, %r9, %r8
mov -8(up), %r11
mov %r8, -8(rp)
-L(01): shld R8(cnt), %r10, %r9
+L(01): shld %cl, %r10, %r9
mov -16(up), %r8
mov %r9, -16(rp)
-L(00): shld R8(cnt), %r11, %r10
+L(00): shld %cl, %r11, %r10
mov -24(up), %r9
+ lea -32(up), up
mov %r10, -24(rp)
- add $-32, up
lea -32(rp), rp
sub $4, n
jnc L(top)
-L(end): shld R8(cnt), %r8, %r11
+L(end): shld %cl, %r8, %r11
mov %r11, (rp)
- shld R8(cnt), %r9, %r8
+ shld %cl, %r9, %r8
mov %r8, -8(rp)
- shl R8(cnt), %r9
+ shl %cl, %r9
mov %r9, -16(rp)
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/lshiftc.asm b/gmp/mpn/x86_64/core2/lshiftc.asm
deleted file mode 100644
index 65c7b2f1b8..0000000000
--- a/gmp/mpn/x86_64/core2/lshiftc.asm
+++ /dev/null
@@ -1,159 +0,0 @@
-dnl x86-64 mpn_lshiftc optimized for "Core 2".
-
-dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 1.5
-C Intel NHM 2.25 (up to about n = 260, then 1.875)
-C Intel SBR 2.25
-C Intel atom ?
-C VIA nano ?
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_lshiftc)
- FUNC_ENTRY(4)
- lea -8(rp,n,8), rp
- lea -8(up,n,8), up
-
- mov R32(%rdx), R32(%rax)
- and $3, R32(%rax)
- jne L(nb00)
-L(b00): C n = 4, 8, 12, ...
- mov (up), %r10
- mov -8(up), %r11
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r10, %rax
- mov -16(up), %r8
- lea 24(rp), rp
- sub $4, n
- jmp L(00)
-
-L(nb00):C n = 1, 5, 9, ...
- cmp $2, R32(%rax)
- jae L(nb01)
-L(b01): mov (up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r9, %rax
- sub $2, n
- jb L(le1)
- mov -8(up), %r10
- mov -16(up), %r11
- lea -8(up), up
- lea 16(rp), rp
- jmp L(01)
-L(le1): shl R8(cnt), %r9
- not %r9
- mov %r9, (rp)
- FUNC_EXIT()
- ret
-
-L(nb01):C n = 2, 6, 10, ...
- jne L(b11)
-L(b10): mov (up), %r8
- mov -8(up), %r9
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r8, %rax
- sub $3, n
- jb L(le2)
- mov -16(up), %r10
- lea -16(up), up
- lea 8(rp), rp
- jmp L(10)
-L(le2): shld R8(cnt), %r9, %r8
- not %r8
- mov %r8, (rp)
- shl R8(cnt), %r9
- not %r9
- mov %r9, -8(rp)
- FUNC_EXIT()
- ret
-
- ALIGN(16) C performance critical!
-L(b11): C n = 3, 7, 11, ...
- mov (up), %r11
- mov -8(up), %r8
- xor R32(%rax), R32(%rax)
- shld R8(cnt), %r11, %rax
- mov -16(up), %r9
- lea -24(up), up
- sub $4, n
- jb L(end)
-
- ALIGN(16)
-L(top): shld R8(cnt), %r8, %r11
- mov (up), %r10
- not %r11
- mov %r11, (rp)
-L(10): shld R8(cnt), %r9, %r8
- mov -8(up), %r11
- not %r8
- mov %r8, -8(rp)
-L(01): shld R8(cnt), %r10, %r9
- mov -16(up), %r8
- not %r9
- mov %r9, -16(rp)
-L(00): shld R8(cnt), %r11, %r10
- mov -24(up), %r9
- not %r10
- mov %r10, -24(rp)
- add $-32, up
- lea -32(rp), rp
- sub $4, n
- jnc L(top)
-
-L(end): shld R8(cnt), %r8, %r11
- not %r11
- mov %r11, (rp)
- shld R8(cnt), %r9, %r8
- not %r8
- mov %r8, -8(rp)
- shl R8(cnt), %r9
- not %r9
- mov %r9, -16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/mul_basecase.asm b/gmp/mpn/x86_64/core2/mul_basecase.asm
deleted file mode 100644
index d16be852f7..0000000000
--- a/gmp/mpn/x86_64/core2/mul_basecase.asm
+++ /dev/null
@@ -1,975 +0,0 @@
-dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
-dnl It also seems good for Conroe/Wolfdale.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_1 mul_2 mul_3 addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core 4.0 4.0 - 4.18-4.25
-C Intel NHM 3.75 3.8 - 4.06-4.2
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C Code structure:
-C
-C
-C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
-C | | | |
-C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
-C | / | / | / | /
-C | / | / | / | /
-C | / | / | / | /
-C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
-C _____ _____ _____ _____
-C / \ / \ / \ / \
-C \|/ | \|/ | \|/ | \|/ |
-C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) |
-C \ /|\ \ /|\ \ /|\ \ /|\
-C \_____/ \_____/ \_____/ \_____/
-
-C TODO
-C * Tune. None done so far.
-C * Currently 2687 bytes, making it smaller would be nice.
-C * Implement some basecases, say for un < 4.
-C * Try zeroing with xor in m2 loops.
-C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
-C between loop header and wind-down code.
-C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-C Define this to $1 to use late loop index variable as zero, $2 to use an
-C explicit $0.
-define(`Z',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param', `%rdx')
-define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance!
-define(`vn_param', `%r8')
-
-define(`un', `%r9')
-define(`vn', `(%rsp)')
-
-define(`v0', `%r10')
-define(`v1', `%r11')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r12')
-define(`i', `%r13')
-define(`vp', `%r14')
-
-define(`X0', `%r8')
-define(`X1', `%r15')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
- mov (up), %rax C shared for mul_1 and mul_2
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
-
- mov (vp_param), v0 C shared for mul_1 and mul_2
-
- xor un, un
- sub un_param, un C un = -un_param
-
- lea (up,un_param,8), up
- lea (rp,un_param,8), rp
-
- mul v0 C shared for mul_1 and mul_2
-
- test $1, R8(vn_param)
- jz L(m2)
-
- lea 8(vp_param), vp C FIXME: delay until known needed
-
- test $1, R8(un)
- jnz L(m1x1)
-
-L(m1x0):test $2, R8(un)
- jnz L(m1s2)
-
-L(m1s0):
- lea (un), i
- mov %rax, (rp,un,8)
- mov 8(up,un,8), %rax
- mov %rdx, w0 C FIXME: Use lea?
- lea L(do_am0)(%rip), %rbp
- jmp L(m1e0)
-
-L(m1s2):
- lea 2(un), i
- mov %rax, (rp,un,8)
- mov 8(up,un,8), %rax
- mov %rdx, w0 C FIXME: Use lea?
- mul v0
- lea L(do_am2)(%rip), %rbp
- test i, i
- jnz L(m1e2)
- add %rax, w0
- adc $0, %rdx
- mov w0, I(-8(rp),8(rp,un,8))
- mov %rdx, I((rp),16(rp,un,8))
- jmp L(ret2)
-
-L(m1x1):test $2, R8(un)
- jz L(m1s3)
-
-L(m1s1):
- lea 1(un), i
- mov %rax, (rp,un,8)
- test i, i
- jz L(1)
- mov 8(up,un,8), %rax
- mov %rdx, w1 C FIXME: Use lea?
- lea L(do_am1)(%rip), %rbp
- jmp L(m1e1)
-L(1): mov %rdx, I((rp),8(rp,un,8))
- jmp L(ret2)
-
-L(m1s3):
- lea -1(un), i
- mov %rax, (rp,un,8)
- mov 8(up,un,8), %rax
- mov %rdx, w1 C FIXME: Use lea?
- lea L(do_am3)(%rip), %rbp
- jmp L(m1e3)
-
- ALIGNx
-L(m1top):
- mul v0
- mov w1, -16(rp,i,8)
-L(m1e2):xor R32(w1), R32(w1)
- add %rax, w0
- mov (up,i,8), %rax
- adc %rdx, w1
- mov w0, -8(rp,i,8)
-L(m1e1):xor R32(w0), R32(w0)
- mul v0
- add %rax, w1
- mov 8(up,i,8), %rax
- adc %rdx, w0
- mov w1, (rp,i,8)
-L(m1e0):xor R32(w1), R32(w1)
- mul v0
- add %rax, w0
- mov 16(up,i,8), %rax
- adc %rdx, w1
- mov w0, 8(rp,i,8)
-L(m1e3):xor R32(w0), R32(w0)
- mul v0
- add %rax, w1
- mov 24(up,i,8), %rax
- adc %rdx, w0
- add $4, i
- js L(m1top)
-
- mul v0
- mov w1, I(-16(rp),-16(rp,i,8))
- add %rax, w0
- adc $0, %rdx
- mov w0, I(-8(rp),-8(rp,i,8))
- mov %rdx, I((rp),(rp,i,8))
-
- dec vn_param
- jz L(ret2)
- lea -8(rp), rp
- jmp *%rbp
-
-L(m2):
- mov 8(vp_param), v1
- lea 16(vp_param), vp C FIXME: delay until known needed
-
- test $1, R8(un)
- jnz L(bx1)
-
-L(bx0): test $2, R8(un)
- jnz L(b10)
-
-L(b00): lea (un), i
- mov %rax, (rp,un,8)
- mov %rdx, w1 C FIXME: Use lea?
- mov (up,un,8), %rax
- mov $0, R32(w2)
- jmp L(m2e0)
-
-L(b10): lea -2(un), i
- mov %rax, w2 C FIXME: Use lea?
- mov (up,un,8), %rax
- mov %rdx, w3 C FIXME: Use lea?
- mov $0, R32(w0)
- jmp L(m2e2)
-
-L(bx1): test $2, R8(un)
- jz L(b11)
-
-L(b01): lea 1(un), i
- mov %rax, (rp,un,8)
- mov (up,un,8), %rax
- mov %rdx, w0 C FIXME: Use lea?
- mov $0, R32(w1)
- jmp L(m2e1)
-
-L(b11): lea -1(un), i
- mov %rax, w1 C FIXME: Use lea?
- mov (up,un,8), %rax
- mov %rdx, w2 C FIXME: Use lea?
- mov $0, R32(w3)
- jmp L(m2e3)
-
- ALIGNx
-L(m2top0):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
-L(m2e0):mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top0)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am0):
- push %r15
- push vn_param
-
-L(olo0):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
-C lea 0(un), i
- mov un, i
- mul v0
- mov %rax, X0
- mov (up,un,8), %rax
- MOV( %rdx, X1, 2)
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,un,8), w2
- mov %rax, w3
- jmp L(lo0)
-
- ALIGNx
-L(am2top0):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
-L(lo0): mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top0)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo0)
-
-L(ret): pop %rax
- pop %r15
-L(ret2):pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-
- ALIGNx
-L(m2top1):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
-L(m2e1):mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top1)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am1):
- push %r15
- push vn_param
-
-L(olo1):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
- lea 1(un), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 128)
- mov (up,un,8), %rax
- mov (rp,un,8), w1
- mul v1
- mov %rax, w2
- mov 8(up,un,8), %rax
- MOV( %rdx, w3, 1)
- jmp L(lo1)
-
- ALIGNx
-L(am2top1):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
-L(lo1): mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top1)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo1)
-
- pop %rax
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-
- ALIGNx
-L(m2top2):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
-L(m2e2):mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top2)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am2):
- push %r15
- push vn_param
-
-L(olo2):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
- lea -2(un), i
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 32)
- mov (up,un,8), %rax
- mov (rp,un,8), w0
- mul v1
- mov %rax, w1
- lea (%rdx), w2
- mov 8(up,un,8), %rax
- jmp L(lo2)
-
- ALIGNx
-L(am2top2):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
-L(lo2): mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top2)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo2)
-
- pop %rax
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-
- ALIGNx
-L(m2top3):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
-L(m2e3):mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top3)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov w0, I((rp),(rp,i,8))
- mov w1, I(8(rp),8(rp,i,8))
-
- add $-2, vn_param
- jz L(ret2)
-
-L(do_am3):
- push %r15
- push vn_param
-
-L(olo3):
- mov (vp), v0
- mov 8(vp), v1
- lea 16(vp), vp
- lea 16(rp), rp
- mov (up,un,8), %rax
- lea -1(un), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 8)
- mov (up,un,8), %rax
- mov (rp,un,8), w3
- mul v1
- mov %rax, w0
- MOV( %rdx, w1, 16)
- mov 8(up,un,8), %rax
- jmp L(lo3)
-
- ALIGNx
-L(am2top3):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
-L(lo3): mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top3)
-
- mul v1
- add w0, w1
- adc %rax, w2
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add w2, X0
- mov X0, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- addl $-2, vn
- jnz L(olo3)
-
- pop %rax
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/mullo_basecase.asm b/gmp/mpn/x86_64/core2/mullo_basecase.asm
deleted file mode 100644
index 0f03d867f6..0000000000
--- a/gmp/mpn/x86_64/core2/mullo_basecase.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core 4.0 4.18-4.25
-C Intel NHM 3.75 4.06-4.2
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Implement proper cor2, replacing current cor0.
-C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?)
-C * Micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n_param', `%rcx')
-
-define(`v0', `%r10')
-define(`v1', `%r11')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r12')
-define(`n', `%r9')
-define(`i', `%r13')
-define(`vp', `%r8')
-
-define(`X0', `%r14')
-define(`X1', `%r15')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mullo_basecase)
- FUNC_ENTRY(4)
-
- mov (up), %rax
- mov vp_param, vp
-
- cmp $4, n_param
- jb L(small)
-
- mov (vp_param), v0
- push %rbx
- lea (rp,n_param,8), rp C point rp at R[un]
- push %rbp
- lea (up,n_param,8), up C point up right after U's end
- push %r12
- mov $0, R32(n) C FIXME
- sub n_param, n
- push %r13
- mul v0
- mov 8(vp), v1
-
- test $1, R8(n_param)
- jnz L(m2x1)
-
-L(m2x0):test $2, R8(n_param)
- jnz L(m2b2)
-
-L(m2b0):lea (n), i
- mov %rax, (rp,n,8)
- mov %rdx, w1
- mov (up,n,8), %rax
- xor R32(w2), R32(w2)
- jmp L(m2e0)
-
-L(m2b2):lea -2(n), i
- mov %rax, w2
- mov (up,n,8), %rax
- mov %rdx, w3
- xor R32(w0), R32(w0)
- jmp L(m2e2)
-
-L(m2x1):test $2, R8(n_param)
- jnz L(m2b3)
-
-L(m2b1):lea 1(n), i
- mov %rax, (rp,n,8)
- mov (up,n,8), %rax
- mov %rdx, w0
- xor R32(w1), R32(w1)
- jmp L(m2e1)
-
-L(m2b3):lea -1(n), i
- xor R32(w3), R32(w3)
- mov %rax, w1
- mov %rdx, w2
- mov (up,n,8), %rax
- jmp L(m2e3)
-
- ALIGNx
-L(m2tp):mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
-L(m2e1):mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
-L(m2e0):mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
-L(m2e3):mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
-L(m2e2):mul v1
- mov $0, R32(w1) C FIXME: dead in last iteration
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0 C FIXME: dead in last iteration
- add $4, i
- js L(m2tp)
-
-L(m2ed):imul v0, %rax
- add w3, %rax
- mov %rax, I(-8(rp),-8(rp,i,8))
-
- add $2, n
- lea 16(vp), vp
- lea -16(up), up
- cmp $-2, n
- jge L(cor1)
-
- push %r14
- push %r15
-
-L(outer):
- mov (vp), v0
- mov 8(vp), v1
- mov (up,n,8), %rax
- mul v0
- test $1, R8(n)
- jnz L(a1x1)
-
-L(a1x0):mov %rax, X1
- MOV( %rdx, X0, 8)
- mov (up,n,8), %rax
- mul v1
- test $2, R8(n)
- jnz L(a110)
-
-L(a100):lea (n), i
- mov (rp,n,8), w3
- mov %rax, w0
- MOV( %rdx, w1, 16)
- jmp L(lo0)
-
-L(a110):lea 2(n), i
- mov (rp,n,8), w1
- mov %rax, w2
- mov 8(up,n,8), %rax
- MOV( %rdx, w3, 1)
- jmp L(lo2)
-
-L(a1x1):mov %rax, X0
- MOV( %rdx, X1, 2)
- mov (up,n,8), %rax
- mul v1
- test $2, R8(n)
- jz L(a111)
-
-L(a101):lea 1(n), i
- MOV( %rdx, w0, 4)
- mov (rp,n,8), w2
- mov %rax, w3
- jmp L(lo1)
-
-L(a111):lea -1(n), i
- MOV( %rdx, w2, 64)
- mov %rax, w1
- mov (rp,n,8), w0
- mov 8(up,n,8), %rax
- jmp L(lo3)
-
- ALIGNx
-L(top): mul v1
- add w0, w1
- adc %rax, w2
- mov -8(up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
-L(lo2): mul v0
- add w1, X1
- mov X1, -16(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov -8(up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov -8(rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
-L(lo1): mov (up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, -8(rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov (up,i,8), %rax
- mov (rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
-L(lo0): mov 8(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, (rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 8(rp,i,8), w3
- adc $0, X1
- mov 8(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 16(up,i,8), %rax
- adc $0, w2
-L(lo3): mul v0
- add w0, X0
- mov X0, 8(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 16(up,i,8), %rax
- mov 16(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(top)
-
-L(end): imul v1, %rax
- add w0, w1
- adc %rax, w2
- mov I(-8(up),-8(up,i,8)), %rax
- imul v0, %rax
- add w1, X1
- mov X1, I(-16(rp),-16(rp,i,8))
- adc X0, %rax
- mov I(-8(rp),-8(rp,i,8)), w1
- add w1, w2
- add w2, %rax
- mov %rax, I(-8(rp),-8(rp,i,8))
-
- add $2, n
- lea 16(vp), vp
- lea -16(up), up
- cmp $-2, n
- jl L(outer)
-
- pop %r15
- pop %r14
-
- jnz L(cor0)
-
-L(cor1):mov (vp), v0
- mov 8(vp), v1
- mov -16(up), %rax
- mul v0 C u0 x v2
- add -16(rp), %rax C FIXME: rp[0] still available in reg?
- adc -8(rp), %rdx C FIXME: rp[1] still available in reg?
- mov -8(up), %rbx
- imul v0, %rbx
- mov -16(up), %rcx
- imul v1, %rcx
- mov %rax, -16(rp)
- add %rbx, %rcx
- add %rdx, %rcx
- mov %rcx, -8(rp)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(cor0):mov (vp), %r11
- imul -8(up), %r11
- add %rax, %r11
- mov %r11, -8(rp)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(small):
- cmp $2, n_param
- jae L(gt1)
-L(n1): imul (vp_param), %rax
- mov %rax, (rp)
- FUNC_EXIT()
- ret
-L(gt1): ja L(gt2)
-L(n2): mov (vp_param), %r9
- mul %r9
- mov %rax, (rp)
- mov 8(up), %rax
- imul %r9, %rax
- add %rax, %rdx
- mov 8(vp), %r9
- mov (up), %rcx
- imul %r9, %rcx
- add %rcx, %rdx
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-L(gt2):
-L(n3): mov (vp_param), %r9
- mul %r9 C u0 x v0
- mov %rax, (rp)
- mov %rdx, %r10
- mov 8(up), %rax
- mul %r9 C u1 x v0
- imul 16(up), %r9 C u2 x v0
- add %rax, %r10
- adc %rdx, %r9
- mov 8(vp), %r11
- mov (up), %rax
- mul %r11 C u0 x v1
- add %rax, %r10
- adc %rdx, %r9
- imul 8(up), %r11 C u1 x v1
- add %r11, %r9
- mov %r10, 8(rp)
- mov 16(vp), %r10
- mov (up), %rax
- imul %rax, %r10 C u0 x v2
- add %r10, %r9
- mov %r9, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/popcount.asm b/gmp/mpn/x86_64/core2/popcount.asm
index e935cf1892..6c22999ff4 100644
--- a/gmp/mpn/x86_64/core2/popcount.asm
+++ b/gmp/mpn/x86_64/core2/popcount.asm
@@ -3,33 +3,21 @@ dnl x86-64 mpn_popcount optimized for "Core 2".
dnl Copyright 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-MULFUNC_PROLOGUE(mpn_popcount)
include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp/mpn/x86_64/core2/redc_1.asm b/gmp/mpn/x86_64/core2/redc_1.asm
deleted file mode 100644
index d0e96ef1cb..0000000000
--- a/gmp/mpn/x86_64/core2/redc_1.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bull ?
-C AMD pile ?
-C AMD steam ?
-C AMD bobcat ?
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core 4.5 (fluctuating)
-C Intel NHM ?
-C Intel SBR ?
-C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom ?
-C VIA nano ?
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Micro-optimise, none performed thus far.
-C * Consider inlining mpn_add_n.
-C * Single basecases out before the pushes.
-C * Keep up[i] in registers for basecases (might require pushes).
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%r12')
-define(`q0', `%r13')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C X q0' n X rp up u0i mp q0 i j
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_redc_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (up), q0
- mov n, j C outer loop induction var
- lea (mp_param,n,8), mp
- lea -16(up,n,8), up
- neg n
- imul u0inv, q0 C first iteration q0
-
- test $1, R8(n)
- jz L(b0)
-
-L(b1): cmp $-1, R32(n)
- jz L(n1)
- cmp $-3, R32(n)
- jz L(n3)
-
- push rp
-
-L(otp1):lea 3(n), i
- mov (mp,n,8), %rax
- mul q0
- lea (%rax), %rbp
- mov 8(mp,n,8), %rax
- lea (%rdx), %r9
- mul q0
- lea (%rax), %r11
- mov 16(mp,n,8), %rax
- mov 16(up,n,8), %r10
- lea (%rdx), %rdi
- mul q0
- add %rbp, %r10
- lea (%rax), %rbp
- mov 24(mp,n,8), %rax
- adc %r9, %r11
- mov 24(up,n,8), %rbx
- lea (%rdx), %r9
- adc $0, %rdi
- mul q0
- add %r11, %rbx
- lea (%rax), %r11
- mov 32(mp,n,8), %rax
- adc %rdi, %rbp
- mov %rbx, 24(up,n,8)
- mov 32(up,n,8), %r10
- lea (%rdx), %rdi
- adc $0, %r9
- imul u0inv, %rbx C next q limb
- add $2, i
- jns L(ed1)
-
- ALIGNx
-L(tp1): mul q0
- add %rbp, %r10
- lea (%rax), %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %r10, -8(up,i,8)
- mov (up,i,8), %r10
- lea (%rdx), %r9
- adc $0, %rdi
- mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov 8(mp,i,8), %rax
- adc %rdi, %rbp
- mov %r10, (up,i,8)
- mov 8(up,i,8), %r10
- lea (%rdx), %rdi
- adc $0, %r9
- add $2, i
- js L(tp1)
-
-L(ed1): mul q0
- add %rbp, %r10
- adc %r9, %r11
- mov %r10, I(-8(up),-8(up,i,8))
- mov I((up),(up,i,8)), %r10
- adc $0, %rdi
- add %r11, %r10
- adc %rdi, %rax
- mov %r10, I((up),(up,i,8))
- mov I(8(up),8(up,i,8)), %r10
- adc $0, %rdx
- add %rax, %r10
- mov %r10, I(8(up),8(up,i,8))
- adc $0, %rdx
- mov %rdx, 16(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp1)
- jmp L(cj)
-
-L(b0): cmp $-2, R32(n)
- jz L(n2)
- cmp $-4, R32(n)
- jz L(n4)
-
- push rp
-
-L(otp0):lea 4(n), i
- mov (mp,n,8), %rax
- mul q0
- lea (%rax), %r11
- mov 8(mp,n,8), %rax
- lea (%rdx), %rdi
- mul q0
- lea (%rax), %rbp
- mov 16(mp,n,8), %rax
- mov 16(up,n,8), %r10
- lea (%rdx), %r9
- mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov 24(mp,n,8), %rax
- adc %rdi, %rbp
- mov 24(up,n,8), %rbx
- lea (%rdx), %rdi
- adc $0, %r9
- mul q0
- add %rbp, %rbx
- lea (%rax), %rbp
- mov 32(mp,n,8), %rax
- adc %r9, %r11
- mov %rbx, 24(up,n,8)
- mov 32(up,n,8), %r10
- lea (%rdx), %r9
- adc $0, %rdi
- imul u0inv, %rbx C next q limb
- jmp L(e0)
-
- ALIGNx
-L(tp0): mul q0
- add %rbp, %r10
- lea (%rax), %rbp
- mov (mp,i,8), %rax
- adc %r9, %r11
- mov %r10, -8(up,i,8)
- mov (up,i,8), %r10
- lea (%rdx), %r9
- adc $0, %rdi
-L(e0): mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov 8(mp,i,8), %rax
- adc %rdi, %rbp
- mov %r10, (up,i,8)
- mov 8(up,i,8), %r10
- lea (%rdx), %rdi
- adc $0, %r9
- add $2, i
- js L(tp0)
-
-L(ed0): mul q0
- add %rbp, %r10
- adc %r9, %r11
- mov %r10, I(-8(up),-8(up,i,8))
- mov I((up),(up,i,8)), %r10
- adc $0, %rdi
- add %r11, %r10
- adc %rdi, %rax
- mov %r10, I((up),(up,i,8))
- mov I(8(up),8(up,i,8)), %r10
- adc $0, %rdx
- add %rax, %r10
- mov %r10, I(8(up),8(up,i,8))
- adc $0, %rdx
- mov %rdx, 16(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp0)
-
-L(cj): lea 16(up), up C FIXME
- pop rp
-L(add_n):
-IFSTD(` lea (up,n,8), up C param 2: up
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` lea (up,n,8), %rdx C param 2: up
- lea (%rdx,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov rp, %rcx ') C param 1: rp
-
- CALL( mpn_add_n)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(n1): mov (mp_param), %rax
- mul q0
- add 8(up), %rax
- adc 16(up), %rdx
- mov %rdx, (rp)
- mov $0, R32(%rax)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-L(n2): mov (mp_param), %rax
- mov (up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov 8(up), %r10
- mul q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- add %r9, %r10
- adc $0, %r11
- mov %r10, q0
- imul u0inv, q0 C next q0
- mov -16(mp), %rax
- mul q0
- add %rax, %r10
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov 16(up), %r14
- mul q0
- add %rax, %r14
- adc $0, %rdx
- add %r9, %r14
- adc $0, %rdx
- xor R32(%rax), R32(%rax)
- add %r11, %r14
- adc 24(up), %rdx
- mov %r14, (rp)
- mov %rdx, 8(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
- ALIGNx
-L(n3): mov -24(mp), %rax
- mov -8(up), %r10
- mul q0
- add %rax, %r10
- mov -16(mp), %rax
- mov %rdx, %r11
- adc $0, %r11
- mov (up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- add %r11, %rbp
- mov 8(up), %r10
- adc $0, %r9
- mul q0
- mov %rbp, q0
- imul u0inv, q0 C next q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- mov %rbp, (up)
- add %r9, %r10
- adc $0, %r11
- mov %r10, 8(up)
- mov %r11, -8(up) C up[0]
- lea 8(up), up C up++
- dec j
- jnz L(n3)
-
- mov -32(up), %rdx
- mov -24(up), %rbx
- xor R32(%rax), R32(%rax)
- add %rbp, %rdx
- adc %r10, %rbx
- adc 8(up), %r11
- mov %rdx, (rp)
- mov %rbx, 8(rp)
- mov %r11, 16(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
- ALIGNx
-L(n4): mov -32(mp), %rax
- mul q0
- lea (%rax), %r11
- mov -24(mp), %rax
- lea (%rdx), %r14
- mul q0
- lea (%rax), %rbp
- mov -16(mp), %rax
- mov -16(up), %r10
- lea (%rdx), %r9
- mul q0
- add %r11, %r10
- lea (%rax), %r11
- mov -8(mp), %rax
- adc %r14, %rbp
- mov -8(up), %rbx
- lea (%rdx), %r14
- adc $0, %r9
- mul q0
- add %rbp, %rbx
- adc %r9, %r11
- mov %rbx, -8(up)
- mov (up), %r10
- adc $0, %r14
- imul u0inv, %rbx C next q limb
- add %r11, %r10
- adc %r14, %rax
- mov %r10, (up)
- mov 8(up), %r10
- adc $0, %rdx
- add %rax, %r10
- mov %r10, 8(up)
- adc $0, %rdx
- mov %rdx, -16(up) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(n4)
- lea 16(up), up
- jmp L(add_n)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/core2/rsh1aors_n.asm b/gmp/mpn/x86_64/core2/rsh1aors_n.asm
deleted file mode 100644
index 27eed3712d..0000000000
--- a/gmp/mpn/x86_64/core2/rsh1aors_n.asm
+++ /dev/null
@@ -1,169 +0,0 @@
-dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn.
-
-dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 3.05
-C Intel NHM 3.3
-C Intel SBR 2.5
-C Intel atom ?
-C VIA nano ?
-
-C TODO
-C * Loopmix to approach 2.5 c/l on NHM.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_rsh1add_n)
- define(func_nc, mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsh1sub_n)
- define(func_nc, mpn_rsh1sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
-
- neg %r8 C set C flag from parameter
- mov (up), %r8
- ADCSBB (vp), %r8
- jmp L(ent)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (up), %r8
- ADDSUB (vp), %r8
-L(ent): sbb R32(%rbx), R32(%rbx) C save cy
- mov %r8, %rax
- and $1, R32(%rax) C return value
-
- lea (up,n,8), up
- lea (vp,n,8), vp
- lea (rp,n,8), rp
- mov R32(n), R32(%rbp)
- neg n
- and $3, R32(%rbp)
- jz L(b0)
- cmp $2, R32(%rbp)
- jae L(n1)
-
-L(b1): mov %r8, %rbp
- inc n
- js L(top)
- jmp L(end)
-
-L(n1): jnz L(b3)
- add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up,n,8), %r11
- ADCSBB 8(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- mov %r8, %r10
- add $-2, n
- jmp L(2)
-
-L(b3): add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up,n,8), %r10
- mov 16(up,n,8), %r11
- ADCSBB 8(vp,n,8), %r10
- ADCSBB 16(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- mov %r8, %r9
- dec n
- jmp L(3)
-
-L(b0): add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up,n,8), %r9
- mov 16(up,n,8), %r10
- mov 24(up,n,8), %r11
- ADCSBB 8(vp,n,8), %r9
- ADCSBB 16(vp,n,8), %r10
- ADCSBB 24(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- jmp L(4)
-
- ALIGN(16)
-
-L(top): add R32(%rbx), R32(%rbx) C restore cy
- mov (up,n,8), %r8
- mov 8(up,n,8), %r9
- mov 16(up,n,8), %r10
- mov 24(up,n,8), %r11
- ADCSBB (vp,n,8), %r8
- ADCSBB 8(vp,n,8), %r9
- ADCSBB 16(vp,n,8), %r10
- ADCSBB 24(vp,n,8), %r11
- sbb R32(%rbx), R32(%rbx) C save cy
- shrd $1, %r8, %rbp
- mov %rbp, -8(rp,n,8)
-L(4): shrd $1, %r9, %r8
- mov %r8, (rp,n,8)
-L(3): shrd $1, %r10, %r9
- mov %r9, 8(rp,n,8)
-L(2): shrd $1, %r11, %r10
- mov %r10, 16(rp,n,8)
-L(1): add $4, n
- mov %r11, %rbp
- js L(top)
-
-L(end): shrd $1, %rbx, %rbp
- mov %rbp, -8(rp)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/rshift.asm b/gmp/mpn/x86_64/core2/rshift.asm
index ab32ec85df..9a3fc46f9a 100644
--- a/gmp/mpn/x86_64/core2/rshift.asm
+++ b/gmp/mpn/x86_64/core2/rshift.asm
@@ -1,69 +1,50 @@
dnl x86-64 mpn_rshift optimized for "Core 2".
-dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2007 Free Software Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 4.25
-C AMD K10 4.25
-C Intel P4 14.7
-C Intel core2 1.27
-C Intel NHM 1.375 (up to about n = 260, then 1.5)
-C Intel SBR 1.77
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 4.25
+C K10: 4.25
+C P4: 14.7
+C P6-15: 1.27
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`cnt', `%cl')
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_rshift)
- FUNC_ENTRY(4)
- mov R32(%rdx), R32(%rax)
- and $3, R32(%rax)
+ mov %edx, %eax
+ and $3, %eax
jne L(nb00)
L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov 8(up), %r11
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r10, %rax
+ xor %eax, %eax
+ shrd %cl, %r10, %rax
mov 16(up), %r8
lea 8(up), up
lea -24(rp), rp
@@ -71,11 +52,11 @@ L(b00): C n = 4, 8, 12, ...
jmp L(00)
L(nb00):C n = 1, 5, 9, ...
- cmp $2, R32(%rax)
+ cmp $2, %eax
jae L(nb01)
L(b01): mov (up), %r9
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r9, %rax
+ xor %eax, %eax
+ shrd %cl, %r9, %rax
sub $2, n
jb L(le1)
mov 8(up), %r10
@@ -83,65 +64,62 @@ L(b01): mov (up), %r9
lea 16(up), up
lea -16(rp), rp
jmp L(01)
-L(le1): shr R8(cnt), %r9
+L(le1): shr %cl, %r9
mov %r9, (rp)
- FUNC_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
jne L(b11)
L(b10): mov (up), %r8
mov 8(up), %r9
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r8, %rax
+ xor %eax, %eax
+ shrd %cl, %r8, %rax
sub $3, n
jb L(le2)
mov 16(up), %r10
lea 24(up), up
lea -8(rp), rp
jmp L(10)
-L(le2): shrd R8(cnt), %r9, %r8
+L(le2): shrd %cl, %r9, %r8
mov %r8, (rp)
- shr R8(cnt), %r9
+ shr %cl, %r9
mov %r9, 8(rp)
- FUNC_EXIT()
ret
ALIGN(16)
L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov 8(up), %r8
- xor R32(%rax), R32(%rax)
- shrd R8(cnt), %r11, %rax
+ xor %eax, %eax
+ shrd %cl, %r11, %rax
mov 16(up), %r9
lea 32(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shrd R8(cnt), %r8, %r11
+L(top): shrd %cl, %r8, %r11
mov -8(up), %r10
mov %r11, (rp)
-L(10): shrd R8(cnt), %r9, %r8
+L(10): shrd %cl, %r9, %r8
mov (up), %r11
mov %r8, 8(rp)
-L(01): shrd R8(cnt), %r10, %r9
+L(01): shrd %cl, %r10, %r9
mov 8(up), %r8
mov %r9, 16(rp)
-L(00): shrd R8(cnt), %r11, %r10
+L(00): shrd %cl, %r11, %r10
mov 16(up), %r9
+ lea 32(up), up
mov %r10, 24(rp)
- add $32, up
lea 32(rp), rp
sub $4, n
jnc L(top)
-L(end): shrd R8(cnt), %r8, %r11
+L(end): shrd %cl, %r8, %r11
mov %r11, (rp)
- shrd R8(cnt), %r9, %r8
+ shrd %cl, %r9, %r8
mov %r8, 8(rp)
- shr R8(cnt), %r9
+ shr %cl, %r9
mov %r9, 16(rp)
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/sec_tabselect.asm b/gmp/mpn/x86_64/core2/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/core2/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_sec_tabselect.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/core2/sqr_basecase.asm b/gmp/mpn/x86_64/core2/sqr_basecase.asm
deleted file mode 100644
index a112c1b52e..0000000000
--- a/gmp/mpn/x86_64/core2/sqr_basecase.asm
+++ /dev/null
@@ -1,984 +0,0 @@
-dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
-dnl It also seems good for Conroe/Wolfdale.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core 4.9 4.18-4.25 3.87
-C Intel NHM 3.8 4.06-4.2 3.5
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C Code structure:
-C
-C
-C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
-C | | | |
-C | | | |
-C | | | |
-C \|/ \|/ \|/ \|/
-C ____________ ____________
-C / \ / \
-C \|/ \ \|/ \
-C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
-C \ /|\ \ /|\
-C \____________/ \____________/
-C \ /
-C \ /
-C \ /
-C tail(0m2) tail(1m2)
-C \ /
-C \ /
-C sqr_diag_addlsh1
-
-C TODO
-C * Tune. None done so far.
-C * Currently 2761 bytes, making it smaller would be nice.
-C * Consider using a jumptab-based entry sequence. One might even use a mask-
-C less sequence, if the table is large enough to support tuneup's needs.
-C The code would be, using non-PIC code,
-C lea tab(%rip),%rax; jmp *(n,%rax)
-C or,
-C lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
-C using PIC code. The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
-C with the last four entries repeated a safe number of times.
-C * Consider expanding feed-in code in order to avoid zeroing registers.
-C * Zero consistently with xor.
-C * Check if using "lea (reg),reg" should be done in more places; we have some
-C explicit "mov %rax,reg" now.
-C * Try zeroing with xor in m2 loops.
-C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
-C between loop header and wind-down code.
-C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-C Define this to $1 to use late loop index variable as zero, $2 to use an
-C explicit $0.
-define(`Z',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param', `%rdx')
-
-define(`n', `%r8')
-
-define(`v0', `%r10')
-define(`v1', `%r11')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r9')
-define(`i', `%r13')
-
-define(`X0', `%r12')
-define(`X1', `%r14')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-define(`N', 85)
-ifdef(`N',,`define(`N',0)')
-define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
- FUNC_ENTRY(3)
-
- cmp $4, n_param
- jl L(small)
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
-
- mov (up), v0
- mov 8(up), %rax
- mov %rax, v1
-
- mov $1, R32(n)
- sub n_param, n C n = -n_param+1
- push n
-
- lea (up,n_param,8), up
- lea (rp,n_param,8), rp
-
- mul v0
-
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- mov %rax, (rp,n,8)
- jnz L(b10)
-
-L(b00): lea (n), i C n = 5, 9, ...
- mov %rdx, w1 C FIXME: Use lea?
- xor R32(w2), R32(w2)
- jmp L(m2e0)
-
-L(b10): lea 2(n), i C n = 7, 11, ...
- mov 8(up,n,8), %rax
- mov %rdx, w3 C FIXME: Use lea?
- xor R32(w0), R32(w0)
- xor R32(w1), R32(w1)
- jmp L(m2e2)
-
-L(bx1): test $2, R8(n)
- mov %rax, (rp,n,8)
- jz L(b11)
-
-L(b01): lea 1(n), i C n = 6, 10, ...
- mov %rdx, w0 C FIXME: Use lea?
- xor R32(w1), R32(w1)
- jmp L(m2e1)
-
-L(b11): lea -1(n), i C n = 4, 8, 12, ...
- mov %rdx, w2 C FIXME: Use lea?
- xor R32(w3), R32(w3)
- jmp L(m2e3)
-
-
- ALIGNx
-L(m2top1):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
-L(m2e1):mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top1)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- jmp L(am2o3)
-
- ALIGNx
-L(m2top3):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
-L(m2e3):mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top3)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- cmp $-1, n
- jz L(cor1) C jumps iff entry n = 4
-
-L(am2o1):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea 1(n), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 128)
- mov (rp,n,8), w1
- xor R32(w2), R32(w2)
- mov 8(up,n,8), %rax
- xor R32(w3), R32(w3)
- jmp L(lo1)
-
- ALIGNx
-L(am2top1):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
-L(lo1): mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top1)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
-
-L(am2o3):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea -1(n), i
- mul v0
- mov %rax, X1
- MOV( %rdx, X0, 8)
- mov (rp,n,8), w3
- xor R32(w0), R32(w0)
- xor R32(w1), R32(w1)
- mov 8(up,n,8), %rax
- jmp L(lo3)
-
- ALIGNx
-L(am2top3):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
-L(lo3): mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top3)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
- cmp $-1, n
- jnz L(am2o1)
-
-L(cor1):pop n
- mov %rdx, w3
- mov -16(up), v0
- mov -8(up), %rax
- mul v0
- add w3, %rax
- adc $0, %rdx
- mov %rax, -8(rp)
- mov %rdx, (rp)
- jmp L(sqr_diag_addlsh1)
-
- ALIGNx
-L(m2top2):
-L(m2e2):mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
- mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top2)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- jmp L(am2o0)
-
- ALIGNx
-L(m2top0):
- mul v0
- add %rax, w3
- mov -8(up,i,8), %rax
- mov w3, -8(rp,i,8)
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov $0, R32(w2)
- mov (up,i,8), %rax
- mul v0
- add %rax, w0
- mov w0, (rp,i,8)
- adc %rdx, w1
- mov (up,i,8), %rax
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- adc %rdx, w2
-L(m2e0):mov 8(up,i,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- adc $0, R32(w3)
- mov 8(up,i,8), %rax
- mul v1
- add %rax, w2
- mov w1, 8(rp,i,8)
- adc %rdx, w3
- mov $0, R32(w0)
- mov 16(up,i,8), %rax
- mul v0
- add %rax, w2
- mov 16(up,i,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- mov $0, R32(w1)
- add %rax, w3
- mov 24(up,i,8), %rax
- mov w2, 16(rp,i,8)
- adc %rdx, w0
- add $4, i
- js L(m2top0)
-
- mul v0
- add %rax, w3
- mov I(-8(up),-8(up,i,8)), %rax
- mov w3, I(-8(rp),-8(rp,i,8))
- adc %rdx, w0
- adc R32(w1), R32(w1)
- mul v1
- add w0, %rax
- adc w1, %rdx
- mov %rax, I((rp),(rp,i,8))
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n C decrease |n|
- cmp $-2, n
- jz L(cor2) C jumps iff entry n = 5
-
-L(am2o2):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea -2(n), i
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 32)
- mov (rp,n,8), w0
- xor R32(w1), R32(w1)
- xor R32(w2), R32(w2)
- mov 8(up,n,8), %rax
- jmp L(lo2)
-
- ALIGNx
-L(am2top2):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
- mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
-L(lo2): mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top2)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
-
-L(am2o0):
- mov -8(up,n,8), v0
- mov (up,n,8), %rax
- mov %rax, v1
- lea 0(n), i
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 2)
- xor R32(w0), R32(w0)
- mov (rp,n,8), w2
- xor R32(w3), R32(w3)
- jmp L(lo0)
-
- ALIGNx
-L(am2top0):
- mul v1
- add w0, w1
- adc %rax, w2
- mov (up,i,8), %rax
- MOV( %rdx, w3, 1)
- adc $0, w3
- mul v0
- add w1, X1
- mov X1, -8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 2)
- adc $0, X1
- mov (up,i,8), %rax
- mul v1
- MOV( %rdx, w0, 4)
- mov (rp,i,8), w1
- add w1, w2
- adc %rax, w3
- adc $0, w0
-L(lo0): mov 8(up,i,8), %rax
- mul v0
- add w2, X0
- adc %rax, X1
- mov X0, (rp,i,8)
- MOV( %rdx, X0, 8)
- adc $0, X0
- mov 8(up,i,8), %rax
- mov 8(rp,i,8), w2
- mul v1
- add w2, w3
- adc %rax, w0
- MOV( %rdx, w1, 16)
- adc $0, w1
- mov 16(up,i,8), %rax
- mul v0
- add w3, X1
- mov X1, 8(rp,i,8)
- adc %rax, X0
- MOV( %rdx, X1, 32)
- mov 16(rp,i,8), w3
- adc $0, X1
- mov 16(up,i,8), %rax
- mul v1
- add w3, w0
- MOV( %rdx, w2, 64)
- adc %rax, w1
- mov 24(up,i,8), %rax
- adc $0, w2
- mul v0
- add w0, X0
- mov X0, 16(rp,i,8)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov 24(up,i,8), %rax
- mov 24(rp,i,8), w0
- adc $0, X0
- add $4, i
- jnc L(am2top0)
-
- mul v1
- add w0, w1
- adc w2, %rax
- adc Z(i,$0), %rdx
- add w1, X1
- adc Z(i,$0), X0
- mov X1, I(-8(rp),-8(rp,i,8))
- add X0, %rax
- mov %rax, I((rp),(rp,i,8))
- adc Z(i,$0), %rdx
- mov %rdx, I(8(rp),8(rp,i,8))
-
- lea 16(rp), rp
- add $2, n
- cmp $-2, n
- jnz L(am2o2)
-
-L(cor2):pop n
- mov -24(up), v0
- mov %rax, w2
- mov %rdx, w0
- mov -16(up), %rax
- mov %rax, v1
- mul v0
- mov %rax, X0
- MOV( %rdx, X1, 32)
- mov -8(up), %rax
- mul v0
- add w2, X0
- mov X0, -16(rp)
- MOV( %rdx, X0, 128)
- adc %rax, X1
- mov -8(up), %rax
- adc $0, X0
- mul v1
- add w0, X1
- adc $0, X0
- mov X1, -8(rp)
- add X0, %rax
- mov %rax, (rp)
- adc $0, %rdx
- mov %rdx, 8(rp)
- lea 8(rp), rp
-
-L(sqr_diag_addlsh1):
- mov -8(up,n,8), %rax
- shl n
- xor R32(%rbx), R32(%rbx)
- mul %rax
- mov 8(rp,n,8), %r11
- lea (%rdx), %r10
- mov 16(rp,n,8), %r9
- add %r11, %r11
- jmp L(dm)
-
- ALIGNx
-L(dtop):mul %rax
- add %r11, %r10
- mov 8(rp,n,8), %r11
- mov %r10, -8(rp,n,8)
- adc %r9, %rax
- lea (%rdx,%rbx), %r10
- mov 16(rp,n,8), %r9
- adc %r11, %r11
-L(dm): mov %rax, (rp,n,8)
- mov (up,n,4), %rax
- adc %r9, %r9
- setc R8(%rbx)
- add $2, n
- js L(dtop)
-
- mul %rax
- add %r11, %r10
- mov %r10, -8(rp)
- adc %r9, %rax
- lea (%rdx,%rbx), %r10
- mov %rax, (rp)
- adc $0, %r10
- mov %r10, 8(rp)
-
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(small):
- mov (up), %rax
- cmp $2, n_param
- jae L(gt1)
-L(n1):
- mul %rax
- mov %rax, (rp)
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-
-L(gt1): jne L(gt2)
-L(n2): mov %rax, %r8
- mul %rax
- mov 8(up), %r11
- mov %rax, (rp)
- mov %r11, %rax
- mov %rdx, %r9
- mul %rax
- mov %rax, %r10
- mov %r11, %rax
- mov %rdx, %r11
- mul %r8
- xor %r8, %r8
- add %rax, %r9
- adc %rdx, %r10
- adc %r8, %r11
- add %rax, %r9
- mov %r9, 8(rp)
- adc %rdx, %r10
- mov %r10, 16(rp)
- adc %r8, %r11
- mov %r11, 24(rp)
- FUNC_EXIT()
- ret
-
-L(gt2):
-L(n3): mov %rax, %r10
- mul %rax
- mov 8(up), %r11
- mov %rax, (rp)
- mov %r11, %rax
- mov %rdx, 8(rp)
- mul %rax
- mov 16(up), %rcx
- mov %rax, 16(rp)
- mov %rcx, %rax
- mov %rdx, 24(rp)
- mul %rax
- mov %rax, 32(rp)
- mov %rdx, 40(rp)
-
- mov %r11, %rax
- mul %r10
- mov %rax, %r8
- mov %rcx, %rax
- mov %rdx, %r9
- mul %r10
- xor %r10, %r10
- add %rax, %r9
- mov %r11, %rax
- mov %r10, %r11
- adc %rdx, %r10
-
- mul %rcx
- add %rax, %r10
- adc %r11, %rdx
- add %r8, %r8
- adc %r9, %r9
- adc %r10, %r10
- adc %rdx, %rdx
- adc %r11, %r11
- add %r8, 8(rp)
- adc %r9, 16(rp)
- adc %r10, 24(rp)
- adc %rdx, 32(rp)
- adc %r11, 40(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/core2/sublsh1_n.asm b/gmp/mpn/x86_64/core2/sublsh1_n.asm
deleted file mode 100644
index 46488fcafe..0000000000
--- a/gmp/mpn/x86_64/core2/sublsh1_n.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1)
-define(RSH, 63)
-
-define(ADDSUB, sub)
-define(ADCSBB, sbb)
-define(func, mpn_sublsh1_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh1_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp/mpn/x86_64/core2/sublsh2_n.asm b/gmp/mpn/x86_64/core2/sublsh2_n.asm
deleted file mode 100644
index f3b1e28464..0000000000
--- a/gmp/mpn/x86_64/core2/sublsh2_n.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2)
-define(RSH, 62)
-
-define(ADDSUB, sub)
-define(ADCSBB, sbb)
-define(func, mpn_sublsh2_n)
-
-MULFUNC_PROLOGUE(mpn_sublsh2_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp/mpn/x86_64/coreihwl/addmul_2.asm b/gmp/mpn/x86_64/coreihwl/addmul_2.asm
deleted file mode 100644
index 54aebc888d..0000000000
--- a/gmp/mpn/x86_64/coreihwl/addmul_2.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl AMD64 mpn_addmul_2 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 2.15
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vp', `%rcx')
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-define(`n', `%r11')
-define(`X0', `%r12')
-define(`X1', `%r13')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_addmul_2)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- push %r12
- push %r13
-
- mov (vp), v0
- mov 8(vp), v1
-
- mov n_param, n
- shr $2, n
-
- test $1, R8(n_param)
- jnz L(bx1)
-
-L(bx0): mov (rp), X0
- mov 8(rp), X1
- test $2, R8(n_param)
- jnz L(b10)
-
-L(b00): mov (up), %rdx
- lea 16(up), up
- mulx( v0, %rax, w1)
- add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- mov X0, (rp)
- add %rax, X1
- adc $0, w2
- mov -8(up), %rdx
- lea 16(rp), rp
- jmp L(lo0)
-
-L(b10): mov (up), %rdx
- inc n
- mulx( v0, %rax, w1)
- add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- mov X0, (rp)
- mov 16(rp), X0
- add %rax, X1
- adc $0, w2
- xor w0, w0
- jmp L(lo2)
-
-L(bx1): mov (rp), X1
- mov 8(rp), X0
- test $2, R8(n_param)
- jnz L(b11)
-
-L(b01): mov (up), %rdx
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov 8(up), %rdx
- mov X1, (rp)
- mov 16(rp), X1
- mulx( v0, %rax, w1)
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo1)
-
-L(b11): mov (up), %rdx
- inc n
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov X1, (rp)
- mov 8(up), %rdx
- mulx( v0, %rax, w1)
- lea 8(rp), rp
- lea 8(up), up
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo1): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo0): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo3): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
-L(lo2): mov 8(up), %rdx
- lea 32(up), up
- dec n
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
- add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w2, %rdx
- adc $0, %rax
- add w3, %rdx
- mov %rdx, 16(rp)
- adc $0, %rax
-
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm b/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm
deleted file mode 100644
index fd5a26d00f..0000000000
--- a/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm
+++ /dev/null
@@ -1,198 +0,0 @@
-dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 2.32
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Handle small n separately, for lower overhead.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0_param',`%rcx') C r9
-
-define(`n', `%rbp')
-define(`v0', `%rdx')
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`ADCSBB', `adc')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`ADCSBB', `sbb')
- define(`func', `mpn_submul_1')
-')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- push %r12
- push %r13
-
- mov n_param, n
- mov v0_param, v0
-
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): shr $2, n
- jc L(b10)
-
-L(b00): mulx( (up), %r13, %r12)
- mulx( 8,(up), %rbx, %rax)
- add %r12, %rbx
- adc $0, %rax
- mov (rp), %r12
- mov 8(rp), %rcx
- mulx( 16,(up), %r9, %r8)
- lea -16(rp), rp
- lea 16(up), up
- ADDSUB %r13, %r12
- jmp L(lo0)
-
-L(bx1): shr $2, n
- jc L(b11)
-
-L(b01): mulx( (up), %r11, %r10)
- jnz L(gt1)
-L(n1): ADDSUB %r11, (rp)
- mov $0, R32(%rax)
- adc %r10, %rax
- jmp L(ret)
-
-L(gt1): mulx( 8,(up), %r13, %r12)
- mulx( 16,(up), %rbx, %rax)
- lea 24(up), up
- add %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov (rp), %r10
- mov 8(rp), %r12
- mov 16(rp), %rcx
- lea -8(rp), rp
- ADDSUB %r11, %r10
- jmp L(lo1)
-
-L(b11): mulx( (up), %rbx, %rax)
- mov (rp), %rcx
- mulx( 8,(up), %r9, %r8)
- lea 8(up), up
- lea -24(rp), rp
- inc n C adjust n
- ADDSUB %rbx, %rcx
- jmp L(lo3)
-
-L(b10): mulx( (up), %r9, %r8)
- mulx( 8,(up), %r11, %r10)
- lea -32(rp), rp
- mov $0, R32(%rax)
- clc C clear cf
- jz L(end) C depends on old shift
-
- ALIGN(16)
-L(top): adc %rax, %r9
- lea 32(rp), rp
- adc %r8, %r11
- mulx( 16,(up), %r13, %r12)
- mov (rp), %r8
- mulx( 24,(up), %rbx, %rax)
- lea 32(up), up
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov 8(rp), %r10
- mov 16(rp), %r12
- ADDSUB %r9, %r8
- mov 24(rp), %rcx
- mov %r8, (rp)
- ADCSBB %r11, %r10
-L(lo1): mulx( (up), %r9, %r8)
- mov %r10, 8(rp)
- ADCSBB %r13, %r12
-L(lo0): mov %r12, 16(rp)
- ADCSBB %rbx, %rcx
-L(lo3): mulx( 8,(up), %r11, %r10)
- mov %rcx, 24(rp)
- dec n
- jnz L(top)
-
-L(end): adc %rax, %r9
- adc %r8, %r11
- mov 32(rp), %r8
- mov %r10, %rax
- adc $0, %rax
- mov 40(rp), %r10
- ADDSUB %r9, %r8
- mov %r8, 32(rp)
- ADCSBB %r11, %r10
- mov %r10, 40(rp)
- adc $0, %rax
-
-L(ret): pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/gmp-mparam.h b/gmp/mpn/x86_64/coreihwl/gmp-mparam.h
deleted file mode 100644
index eef44b3a81..0000000000
--- a/gmp/mpn/x86_64/coreihwl/gmp-mparam.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Haswell gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 2900 MHz Core i5 Haswell */
-/* FFT tuning limit = 75000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 25
-
-#define MUL_TOOM22_THRESHOLD 22
-#define MUL_TOOM33_THRESHOLD 74
-#define MUL_TOOM44_THRESHOLD 195
-#define MUL_TOOM6H_THRESHOLD 298
-#define MUL_TOOM8H_THRESHOLD 406
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 132
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 170
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 117
-#define SQR_TOOM4_THRESHOLD 336
-#define SQR_TOOM6_THRESHOLD 426
-#define SQR_TOOM8_THRESHOLD 562
-
-#define MULMID_TOOM42_THRESHOLD 42
-
-#define MULMOD_BNM1_THRESHOLD 13
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
- { 135,11}, { 79,10}, { 159, 9}, { 319,10}, \
- { 167,11}, { 95,10}, { 191, 9}, { 383,11}, \
- { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \
- { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
- { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \
- { 159,10}, { 319, 9}, { 639,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543, 9}, { 1087,11}, { 287,10}, \
- { 607,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671,11}, { 351,10}, { 703,11}, \
- { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,10}, { 831,12}, { 223,11}, { 447,10}, \
- { 895,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
- { 351,11}, { 703,10}, { 1407,11}, { 735,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \
- { 479,14}, { 127,12}, { 511,11}, { 1023,12}, \
- { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \
- { 607,11}, { 1215,13}, { 319,12}, { 671,11}, \
- { 1343,12}, { 703,11}, { 1407,12}, { 735,13}, \
- { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \
- { 447,12}, { 959,11}, { 1919,13}, { 511,12}, \
- { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
- { 1343,13}, { 703,12}, { 1407,11}, { 2815,14}, \
- { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
- { 1727,13}, { 959,12}, { 1919,14}, { 511,13}, \
- { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \
- { 1215,12}, { 2431,14}, { 639,13}, { 1279,12}, \
- { 2559,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
- { 2815,13}, { 1471,12}, { 2943,14}, { 767,13}, \
- { 1535,12}, { 3071,13}, { 1727,14}, { 895,13}, \
- { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \
- { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
- { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \
- { 2175,13}, { 4351,14}, { 2431,13}, { 4863,15}, \
- { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
- { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \
- { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 237
-#define MUL_FFT_THRESHOLD 4224
-
-#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 344, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511,10}, { 135,11}, \
- { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \
- { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
- { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \
- { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \
- { 639,12}, { 95,11}, { 191,10}, { 383, 9}, \
- { 767,11}, { 207,10}, { 415,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
- { 543, 9}, { 1087,10}, { 575,11}, { 303,10}, \
- { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
- { 735,11}, { 383,10}, { 767,11}, { 415,10}, \
- { 831,11}, { 447,10}, { 895,11}, { 479,13}, \
- { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
- { 607,10}, { 1215,11}, { 671,12}, { 351,11}, \
- { 735,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \
- { 479,14}, { 127,12}, { 511,11}, { 1023,12}, \
- { 543,11}, { 1087,12}, { 607,11}, { 1215,13}, \
- { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
- { 1343,12}, { 735,13}, { 383,12}, { 767,11}, \
- { 1535,12}, { 831,13}, { 447,12}, { 959,13}, \
- { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \
- { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
- { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
- { 1663,13}, { 959,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
- { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
- { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
- { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
- { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \
- { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
- { 1791,13}, { 3583,16}, { 511,15}, { 1023,14}, \
- { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
- { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
- { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \
- { 4223,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 206
-#define SQR_FFT_THRESHOLD 3712
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 78
-#define MULLO_MUL_N_THRESHOLD 8207
-
-#define DC_DIV_QR_THRESHOLD 63
-#define DC_DIVAPPR_Q_THRESHOLD 195
-#define DC_BDIV_QR_THRESHOLD 56
-#define DC_BDIV_Q_THRESHOLD 128
-
-#define INV_MULMOD_BNM1_THRESHOLD 42
-#define INV_NEWTON_THRESHOLD 199
-#define INV_APPR_THRESHOLD 181
-
-#define BINV_NEWTON_THRESHOLD 236
-#define REDC_1_TO_REDC_2_THRESHOLD 47
-#define REDC_2_TO_REDC_N_THRESHOLD 62
-
-#define MU_DIV_QR_THRESHOLD 1470
-#define MU_DIVAPPR_Q_THRESHOLD 1589
-#define MUPI_DIV_QR_THRESHOLD 78
-#define MU_BDIV_QR_THRESHOLD 1442
-#define MU_BDIV_Q_THRESHOLD 1470
-
-#define POWM_SEC_TABLE 3,22,194,257,1099
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 112
-#define HGCD_APPR_THRESHOLD 52
-#define HGCD_REDUCE_THRESHOLD 2681
-#define GCD_DC_THRESHOLD 807
-#define GCDEXT_DC_THRESHOLD 416
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 1326
-#define SET_STR_PRECOMPUTE_THRESHOLD 2627
-
-#define FAC_DSC_THRESHOLD 767
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/x86_64/coreihwl/mul_1.asm b/gmp/mpn/x86_64/coreihwl/mul_1.asm
deleted file mode 100644
index 1e3c338f4e..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mul_1.asm
+++ /dev/null
@@ -1,155 +0,0 @@
-dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bd1 n/a
-C AMD bd2 ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel PNR n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 1.57 this
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0_param',`%rcx') C r9
-
-define(`n', `%rbp')
-define(`v0', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_1)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
- push %r12
-
- mov n_param, n
- shr $2, n
-
- test $1, R8(n_param)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n_param)
- mov v0_param, v0
- jnz L(b10)
-
-L(b00): mulx( (up), %r9, %r8)
- mulx( 8,(up), %r11, %r10)
- mulx( 16,(up), %rcx, %r12)
- lea -32(rp), rp
- jmp L(lo0)
-
-L(b10): mulx( (up), %rcx, %r12)
- mulx( 8,(up), %rbx, %rax)
- lea -16(rp), rp
- test n, n
- jz L(cj2)
- mulx( 16,(up), %r9, %r8)
- lea 16(up), up
- jmp L(lo2)
-
-L(bx1): test $2, R8(n_param)
- mov v0_param, v0
- jnz L(b11)
-
-L(b01): mulx( (up), %rbx, %rax)
- lea -24(rp), rp
- test n, n
- jz L(cj1)
- mulx( 8,(up), %r9, %r8)
- lea 8(up), up
- jmp L(lo1)
-
-L(b11): mulx( (up), %r11, %r10)
- mulx( 8,(up), %rcx, %r12)
- mulx( 16,(up), %rbx, %rax)
- lea -8(rp), rp
- test n, n
- jz L(cj3)
- lea 24(up), up
- jmp L(lo3)
-
- ALIGN(32)
-L(top): lea 32(rp), rp
- mov %r9, (rp)
- adc %r8, %r11
-L(lo3): mulx( (up), %r9, %r8)
- mov %r11, 8(rp)
- adc %r10, %rcx
-L(lo2): mov %rcx, 16(rp)
- adc %r12, %rbx
-L(lo1): mulx( 8,(up), %r11, %r10)
- adc %rax, %r9
- mulx( 16,(up), %rcx, %r12)
- mov %rbx, 24(rp)
-L(lo0): mulx( 24,(up), %rbx, %rax)
- lea 32(up), up
- dec n
- jnz L(top)
-
-L(end): lea 32(rp), rp
- mov %r9, (rp)
- adc %r8, %r11
-L(cj3): mov %r11, 8(rp)
- adc %r10, %rcx
-L(cj2): mov %rcx, 16(rp)
- adc %r12, %rbx
-L(cj1): mov %rbx, 24(rp)
- adc $0, %rax
-
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/coreihwl/mul_2.asm b/gmp/mpn/x86_64/coreihwl/mul_2.asm
deleted file mode 100644
index 5bdb1aa645..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mul_2.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-dnl AMD64 mpn_mul_2 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 1.86
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Move test and jcc together, for insn fusion.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vp', `%rcx')
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_2)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (vp), v0
- mov 8(vp), v1
-
- lea 3(n_param), n
- shr $2, n
-
- test $1, R8(n_param)
- jnz L(bx1)
-
-L(bx0): xor w0, w0
- test $2, R8(n_param)
- mov (up), %rdx
- mulx( v0, w2, w1)
- jz L(lo0)
-
-L(b10): lea -16(rp), rp
- lea -16(up), up
- jmp L(lo2)
-
-L(bx1): xor w2, w2
- test $2, R8(n_param)
- mov (up), %rdx
- mulx( v0, w0, w3)
- jnz L(b11)
-
-L(b01): lea -24(rp), rp
- lea 8(up), up
- jmp L(lo1)
-
-L(b11): lea -8(rp), rp
- lea -8(up), up
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v1, %rax, w0)
- add %rax, w2 C 0
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 1
- add %rax, w2 C 0
- adc $0, w1 C 1
- add w3, w2 C 0
-L(lo0): mov w2, (rp) C 0
- adc $0, w1 C 1
- mulx( v1, %rax, w2)
- add %rax, w0 C 1
- mov 8(up), %rdx
- adc $0, w2 C 2
- mulx( v0, %rax, w3)
- add %rax, w0 C 1
- adc $0, w3 C 2
- add w1, w0 C 1
-L(lo3): mov w0, 8(rp) C 1
- adc $0, w3 C 2
- mulx( v1, %rax, w0)
- add %rax, w2 C 2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 3
- add %rax, w2 C 2
- adc $0, w1 C 3
- add w3, w2 C 2
-L(lo2): mov w2, 16(rp) C 2
- adc $0, w1 C 3
- mulx( v1, %rax, w2)
- add %rax, w0 C 3
- mov 24(up), %rdx
- adc $0, w2 C 4
- mulx( v0, %rax, w3)
- add %rax, w0 C 3
- adc $0, w3 C 4
- add w1, w0 C 3
- lea 32(up), up
-L(lo1): mov w0, 24(rp) C 3
- adc $0, w3 C 4
- dec n
- lea 32(rp), rp
- jnz L(top)
-
-L(end): mulx( v1, %rdx, %rax)
- add %rdx, w2
- adc $0, %rax
- add w3, w2
- mov w2, (rp)
- adc $0, %rax
-
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/mul_basecase.asm b/gmp/mpn/x86_64/coreihwl/mul_basecase.asm
deleted file mode 100644
index b2656c8e9b..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mul_basecase.asm
+++ /dev/null
@@ -1,441 +0,0 @@
-dnl AMD64 mpn_mul_basecase optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_1 mul_2 mul_3 addmul_2
-C AMD K8,K9 n/a n/a - n/a
-C AMD K10 n/a n/a - n/a
-C AMD bull n/a n/a - n/a
-C AMD pile n/a n/a - n/a
-C AMD steam ? ? - ?
-C AMD bobcat n/a n/a - n/a
-C AMD jaguar ? ? - ?
-C Intel P4 n/a n/a - n/a
-C Intel core n/a n/a - n/a
-C Intel NHM n/a n/a - n/a
-C Intel SBR n/a n/a - n/a
-C Intel IBR n/a n/a - n/a
-C Intel HWL 1.77 1.86 - 2.15
-C Intel BWL ? ? - ?
-C Intel atom n/a n/a - n/a
-C VIA nano n/a n/a - n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Adjoin a mul_3.
-C * Further micro-optimise.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-define(`vp', `%rcx')
-define(`vn', `%r8')
-
-define(`un', `%rbx')
-
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`n', `%rbp')
-define(`v0', `%r9')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- mov un_param, un C free up rdx
- neg un
-
- mov un_param, n C FIXME: share
- sar $2, n C FIXME: share
-
- test $1, R8(vn)
- jz L(do_mul_2)
-
-define(`w4', `%r9')
-define(`w5', `%r14')
-
- mov (vp), %rdx
-
-L(do_mul_1):
- test $1, R8(un)
- jnz L(m1x1)
-
-L(m1x0):test $2, R8(un)
- jnz L(m110)
-
-L(m100):
- mulx( (up), w5, w2)
- mulx( 8,(up), w1, w3)
- lea -24(rp), rp
- jmp L(m1l0)
-
-L(m110):
- mulx( (up), w3, w4)
- mulx( 8,(up), w1, w5)
- lea -8(rp), rp
- test n, n
- jz L(cj2)
- mulx( 16,(up), w0, w2)
- lea 16(up), up
- jmp L(m1l2)
-
-L(m1x1):test $2, R8(un)
- jz L(m111)
-
-L(m101):
- mulx( (up), w4, w5)
- lea -16(rp), rp
- test n, n
- jz L(cj1)
- mulx( 8,(up), w0, w2)
- lea 8(up), up
- jmp L(m1l1)
-
-L(m111):
- mulx( (up), w2, w3)
- mulx( 8,(up), w0, w4)
- mulx( 16,(up), w1, w5)
- lea 24(up), up
- test n, n
- jnz L(gt3)
- add w0, w3
- jmp L(cj3)
-L(gt3): add w0, w3
- jmp L(m1l3)
-
- ALIGN(32)
-L(m1tp):lea 32(rp), rp
-L(m1l3):mov w2, (rp)
- mulx( (up), w0, w2)
-L(m1l2):mov w3, 8(rp)
- adc w1, w4
-L(m1l1):adc w0, w5
- mov w4, 16(rp)
- mulx( 8,(up), w1, w3)
-L(m1l0):mov w5, 24(rp)
- mulx( 16,(up), w0, w4)
- adc w1, w2
- mulx( 24,(up), w1, w5)
- adc w0, w3
- lea 32(up), up
- dec n
- jnz L(m1tp)
-
-L(m1ed):lea 32(rp), rp
-L(cj3): mov w2, (rp)
-L(cj2): mov w3, 8(rp)
- adc w1, w4
-L(cj1): mov w4, 16(rp)
- adc $0, w5
- mov w5, 24(rp)
-
- dec R32(vn)
- jz L(ret5)
-
- lea 8(vp), vp
- lea 32(rp), rp
-C push %r12
-C push %r13
-C push %r14
- jmp L(do_addmul)
-
-L(do_mul_2):
-define(`v1', `%r14')
-C push %r12
-C push %r13
-C push %r14
-
- mov (vp), v0
- mov 8(vp), v1
-
- lea (un), n
- sar $2, n
-
- test $1, R8(un)
- jnz L(m2x1)
-
-L(m2x0):xor w0, w0
- test $2, R8(un)
- mov (up), %rdx
- mulx( v0, w2, w1)
- jz L(m2l0)
-
-L(m210):lea -16(rp), rp
- lea -16(up), up
- jmp L(m2l2)
-
-L(m2x1):xor w2, w2
- test $2, R8(un)
- mov (up), %rdx
- mulx( v0, w0, w3)
- jz L(m211)
-
-L(m201):lea -24(rp), rp
- lea 8(up), up
- jmp L(m2l1)
-
-L(m211):lea -8(rp), rp
- lea -8(up), up
- jmp L(m2l3)
-
- ALIGN(16)
-L(m2tp):mulx( v1, %rax, w0)
- add %rax, w2
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2l0):mov w2, (rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 8(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
-L(m2l3):mov w0, 8(rp)
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, w2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2l2):mov w2, 16(rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 24(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
- lea 32(up), up
-L(m2l1):mov w0, 24(rp)
- adc $0, w3
- inc n
- lea 32(rp), rp
- jnz L(m2tp)
-
-L(m2ed):mulx( v1, %rdx, %rax)
- add %rdx, w2
- adc $0, %rax
- add w3, w2
- mov w2, (rp)
- adc $0, %rax
- mov %rax, 8(rp)
-
- add $-2, R32(vn)
- jz L(ret5)
- lea 16(vp), vp
- lea 16(rp), rp
-
-
-L(do_addmul):
- push %r15
- push vn C save vn in new stack slot
-define(`vn', `(%rsp)')
-define(`X0', `%r14')
-define(`X1', `%r15')
-define(`v1', `%r8')
-
- lea (rp,un,8), rp
- lea (up,un,8), up
-
-L(outer):
- mov (vp), v0
- mov 8(vp), v1
-
- lea 2(un), n
- sar $2, n
-
- mov (up), %rdx
- test $1, R8(un)
- jnz L(bx1)
-
-L(bx0): mov (rp), X0
- mov 8(rp), X1
- mulx( v0, %rax, w1)
- add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- mov X0, (rp)
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- test $2, R8(un)
- jnz L(b10)
-
-L(b00): lea 16(up), up
- lea 16(rp), rp
- jmp L(lo0)
-
-L(b10): mov 16(rp), X0
- lea 32(up), up
- mulx( v0, %rax, w3)
- jmp L(lo2)
-
-L(bx1): mov (rp), X1
- mov 8(rp), X0
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov 8(up), %rdx
- mov X1, (rp)
- mulx( v0, %rax, w1)
- test $2, R8(un)
- jz L(b11)
-
-L(b01): mov 16(rp), X1
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo1)
-
-L(b11): lea 8(rp), rp
- lea 8(up), up
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
-L(lo2): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo1): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo0): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo3): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- lea 32(up), up
- inc n
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
- add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w2, %rdx
- adc $0, %rax
- add w3, %rdx
- mov %rdx, 16(rp)
- adc $0, %rax
- mov %rax, 24(rp)
-
- addl $-2, vn
- lea 16(vp), vp
- lea -16(up,un,8), up
- lea 32(rp,un,8), rp
- jnz L(outer)
-
- pop %rax C deallocate vn slot
- pop %r15
-L(ret5):pop %r14
-L(ret4):pop %r13
-L(ret3):pop %r12
-L(ret2):pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm b/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm
deleted file mode 100644
index 9986e8bcfa..0000000000
--- a/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm
+++ /dev/null
@@ -1,426 +0,0 @@
-dnl AMD64 mpn_mullo_basecase optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2
-C AMD K8,K9 n/a n/a
-C AMD K10 n/a n/a
-C AMD bull n/a n/a
-C AMD pile n/a n/a
-C AMD steam ? ?
-C AMD bobcat n/a n/a
-C AMD jaguar ? ?
-C Intel P4 n/a n/a
-C Intel core n/a n/a
-C Intel NHM n/a n/a
-C Intel SBR n/a n/a
-C Intel IBR n/a n/a
-C Intel HWL 1.86 2.15
-C Intel BWL ? ?
-C Intel atom n/a n/a
-C VIA nano n/a n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Implement proper cor2, replacing current cor0.
-C * Micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n', `%rcx')
-
-define(`vp', `%r8')
-define(`X0', `%r14')
-define(`X1', `%r15')
-
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`i', `%rbp')
-define(`v0', `%r9')
-define(`v1', `%rbx')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mullo_basecase)
- FUNC_ENTRY(4)
-
- mov vp_param, vp
- mov (up), %rdx
-
- cmp $4, n
- jb L(small)
-
- push %rbx
- push %rbp
- push %r12
- push %r13
-
- mov (vp), v0
- mov 8(vp), v1
-
- lea 2(n), i
- shr $2, i
- neg n
- add $2, n
-
- push up C put entry `up' on stack
-
- test $1, R8(n)
- jnz L(m2x1)
-
-L(m2x0):mulx( v0, w0, w3)
- xor R32(w2), R32(w2)
- test $2, R8(n)
- jz L(m2b2)
-
-L(m2b0):lea -8(rp), rp
- lea -8(up), up
- jmp L(m2e0)
-
-L(m2b2):lea -24(rp), rp
- lea 8(up), up
- jmp L(m2e2)
-
-L(m2x1):mulx( v0, w2, w1)
- xor R32(w0), R32(w0)
- test $2, R8(n)
- jnz L(m2b3)
-
-L(m2b1):jmp L(m2e1)
-
-L(m2b3):lea -16(rp), rp
- lea -16(up), up
- jmp L(m2e3)
-
- ALIGN(16)
-L(m2tp):mulx( v1, %rax, w0)
- add %rax, w2
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2e1):mov w2, (rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 8(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
-L(m2e0):mov w0, 8(rp)
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, w2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0
- add %rax, w2
- adc $0, w1
- add w3, w2
-L(m2e3):mov w2, 16(rp)
- adc $0, w1
- mulx( v1, %rax, w2)
- add %rax, w0
- mov 24(up), %rdx
- adc $0, w2
- mulx( v0, %rax, w3)
- add %rax, w0
- adc $0, w3
- add w1, w0
- lea 32(up), up
-L(m2e2):mov w0, 24(rp)
- adc $0, w3
- dec i
- lea 32(rp), rp
- jnz L(m2tp)
-
-L(m2ed):mulx( v1, %rax, w0)
- add %rax, w2
- mov (up), %rdx
- mulx( v0, %rax, w1)
- add w2, %rax
- add w3, %rax
- mov %rax, (rp)
-
- mov (%rsp), up C restore `up' to beginning
- lea 16(vp), vp
- lea 8(rp,n,8), rp C put back rp to old rp + 2
- add $2, n
- jge L(cor1)
-
- push %r14
- push %r15
-
-L(outer):
- mov (vp), v0
- mov 8(vp), v1
-
- lea (n), i
- sar $2, i
-
- mov (up), %rdx
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): mov (rp), X1
- mov 8(rp), X0
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- mov 8(up), %rdx
- mov X1, (rp)
- mulx( v0, %rax, w1)
- test $2, R8(n)
- jz L(b2)
-
-L(b0): lea 8(rp), rp
- lea 8(up), up
- jmp L(lo0)
-
-L(b2): mov 16(rp), X1
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo2)
-
-L(bx1): mov (rp), X0
- mov 8(rp), X1
- mulx( v0, %rax, w1)
- add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- mov X0, (rp)
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- test $2, R8(n)
- jnz L(b3)
-
-L(b1): lea 16(up), up
- lea 16(rp), rp
- jmp L(lo1)
-
-L(b3): mov 16(rp), X0
- lea 32(up), up
- mulx( v0, %rax, w3)
- inc i
- jz L(cj3)
- jmp L(lo3)
-
- ALIGN(16)
-L(top): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
-L(lo3): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo2): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo1): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo0): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- lea 32(up), up
- inc i
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
-L(cj3): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- add w1, X1
- mov -16(up), %rdx
- mov X1, 8(rp)
- adc $0, w3
- add w2, X0
- mulx( v0, %rax, w1)
- add X0, %rax
- add w3, %rax
- mov %rax, 16(rp)
-
- mov 16(%rsp), up C restore `up' to beginning
- lea 16(vp), vp
- lea 24(rp,n,8), rp C put back rp to old rp + 2
- add $2, n
- jl L(outer)
-
- pop %r15
- pop %r14
-
- jnz L(cor0)
-
-L(cor1):mov (vp), v0
- mov 8(vp), v1
- mov (up), %rdx
- mulx( v0, %r12, %rbp) C u0 x v2
- add (rp), %r12 C FIXME: rp[0] still available in reg?
- adc %rax, %rbp
- mov 8(up), %r10
- imul v0, %r10
- imul v1, %rdx
- mov %r12, (rp)
- add %r10, %rdx
- add %rbp, %rdx
- mov %rdx, 8(rp)
- pop %rax C deallocate `up' copy
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(cor0):mov (vp), %r11
- imul (up), %r11
- add %rax, %r11
- mov %r11, (rp)
- pop %rax C deallocate `up' copy
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(small):
- cmp $2, n
- jae L(gt1)
-L(n1): imul (vp), %rdx
- mov %rdx, (rp)
- FUNC_EXIT()
- ret
-L(gt1): ja L(gt2)
-L(n2): mov (vp), %r9
- mulx( %r9, %rax, %rdx)
- mov %rax, (rp)
- mov 8(up), %rax
- imul %r9, %rax
- add %rax, %rdx
- mov 8(vp), %r9
- mov (up), %rcx
- imul %r9, %rcx
- add %rcx, %rdx
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-L(gt2):
-L(n3): mov (vp), %r9
- mulx( %r9, %rax, %r10) C u0 x v0
- mov %rax, (rp)
- mov 8(up), %rdx
- mulx( %r9, %rax, %rdx) C u1 x v0
- imul 16(up), %r9 C u2 x v0
- add %rax, %r10
- adc %rdx, %r9
- mov 8(vp), %r11
- mov (up), %rdx
- mulx( %r11, %rax, %rdx) C u0 x v1
- add %rax, %r10
- adc %rdx, %r9
- imul 8(up), %r11 C u1 x v1
- add %r11, %r9
- mov %r10, 8(rp)
- mov 16(vp), %r10
- mov (up), %rax
- imul %rax, %r10 C u0 x v2
- add %r10, %r9
- mov %r9, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/redc_1.asm b/gmp/mpn/x86_64/coreihwl/redc_1.asm
deleted file mode 100644
index f1a475e53c..0000000000
--- a/gmp/mpn/x86_64/coreihwl/redc_1.asm
+++ /dev/null
@@ -1,433 +0,0 @@
-dnl AMD64 mpn_redc_1 optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile n/a
-C AMD steam ?
-C AMD bobcat n/a
-C AMD jaguar ?
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR n/a
-C Intel IBR n/a
-C Intel HWL 2.32
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Micro-optimise.
-C * Consider inlining mpn_add_n. Tests indicate that this saves just 1-2
-C cycles, though.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv_param', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%rdi')
-define(`u0inv', `(%rsp)') C stack
-
-ABI_SUPPORT(DOS64) C FIXME: needs verification
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_redc_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- push rp
- mov mp_param, mp C note that rp and mp shares register
- mov (up), %rdx
-
- neg n
- push %r8 C put u0inv on stack
- imul u0inv_param, %rdx C first iteration q0
- mov n, j C outer loop induction var
-
- test $1, R8(n)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n)
- jz L(o0b)
-
- cmp $-2, R32(n)
- jnz L(o2)
-
-C Special code for n = 2 since general code cannot handle it
- mov 8(%rsp), %rbx C rp
- lea 16(%rsp), %rsp C deallocate two slots
- mulx( (mp), %r9, %r12)
- mulx( 8,(mp), %r11, %r10)
- add %r12, %r11
- adc $0, %r10
- add (up), %r9 C = 0
- adc 8(up), %r11 C r11 = up[1]
- adc $0, %r10 C -> up[0]
- mov %r11, %rdx
- imul u0inv_param, %rdx
- mulx( (mp), %r13, %r12)
- mulx( 8,(mp), %r14, %r15)
- xor R32(%rax), R32(%rax)
- add %r12, %r14
- adc $0, %r15
- add %r11, %r13 C = 0
- adc 16(up), %r14 C rp[2]
- adc $0, %r15 C -> up[1]
- add %r14, %r10
- adc 24(up), %r15
- mov %r10, (%rbx)
- mov %r15, 8(%rbx)
- setc R8(%rax)
- jmp L(ret)
-
-L(o2): lea 2(n), i C inner loop induction var
- mulx( (mp), %r9, %r8)
- mulx( 8,(mp), %r11, %r10)
- sar $2, i
- add %r8, %r11
- jmp L(lo2)
-
- ALIGN(16)
-L(tp2): adc %rax, %r9
- lea 32(up), up
- adc %r8, %r11
-L(lo2): mulx( 16,(mp), %r13, %r12)
- mov (up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov 8(up), %r10
- mov 16(up), %r12
- add %r9, %r8
- mov 24(up), %rbp
- mov %r8, (up)
- adc %r11, %r10
- mulx( (mp), %r9, %r8)
- mov %r10, 8(up)
- adc %r13, %r12
- mov %r12, 16(up)
- adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 24(up)
- inc i
- jnz L(tp2)
-
-L(ed2): mov 56(up,n,8), %rdx C next iteration up[0]
- lea 16(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 32(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 40(up), %rax
- add %r9, %r8
- mov %r8, 32(up)
- adc %r11, %rax
- mov %rax, 40(up)
- lea 56(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)
- inc j
- jnz L(o2)
-
- jmp L(cj)
-
-
-L(bx1): test $2, R8(n)
- jz L(o3a)
-
-L(o1a): cmp $-1, R32(n)
- jnz L(o1b)
-
-C Special code for n = 1 since general code cannot handle it
- mov 8(%rsp), %rbx C rp
- lea 16(%rsp), %rsp C deallocate two slots
- mulx( (mp), %r11, %r10)
- add (up), %r11
- adc 8(up), %r10
- mov %r10, (%rbx)
- mov $0, R32(%rax)
- setc R8(%rax)
- jmp L(ret)
-
-L(o1b): lea 24(mp), mp
-L(o1): lea 1(n), i C inner loop induction var
- mulx( -24,(mp), %r11, %r10)
- mulx( -16,(mp), %r13, %r12)
- mulx( -8,(mp), %rbx, %rax)
- sar $2, i
- add %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov (up), %r10
- mov 8(up), %r12
- mov 16(up), %rbp
- add %r11, %r10
- jmp L(lo1)
-
- ALIGN(16)
-L(tp1): adc %rax, %r9
- lea 32(up), up
- adc %r8, %r11
- mulx( 16,(mp), %r13, %r12)
- mov -8(up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov (up), %r10
- mov 8(up), %r12
- add %r9, %r8
- mov 16(up), %rbp
- mov %r8, -8(up)
- adc %r11, %r10
-L(lo1): mulx( (mp), %r9, %r8)
- mov %r10, (up)
- adc %r13, %r12
- mov %r12, 8(up)
- adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 16(up)
- inc i
- jnz L(tp1)
-
-L(ed1): mov 48(up,n,8), %rdx C next iteration up[0]
- lea 40(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 24(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 32(up), %rax
- add %r9, %r8
- mov %r8, 24(up)
- adc %r11, %rax
- mov %rax, 32(up)
- lea 48(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)
- inc j
- jnz L(o1)
-
- jmp L(cj)
-
-L(o3a): cmp $-3, R32(n)
- jnz L(o3b)
-
-C Special code for n = 3 since general code cannot handle it
-L(n3): mulx( (mp), %rbx, %rax)
- mulx( 8,(mp), %r9, %r14)
- add (up), %rbx
- mulx( 16,(mp), %r11, %r10)
- adc %rax, %r9 C W 1
- adc %r14, %r11 C W 2
- mov 8(up), %r14
- mov u0inv_param, %rdx
- adc $0, %r10 C W 3
- mov 16(up), %rax
- add %r9, %r14 C W 1
- mov %r14, 8(up)
- mulx( %r14, %rdx, %r13) C next iteration q0
- adc %r11, %rax C W 2
- mov %rax, 16(up)
- adc $0, %r10 C W 3
- mov %r10, (up)
- lea 8(up), up C up = (last starting up) + 1
- inc j
- jnz L(n3)
-
- jmp L(cj)
-
-L(o3b): lea 8(mp), mp
-L(o3): lea 4(n), i C inner loop induction var
- mulx( -8,(mp), %rbx, %rax)
- mulx( (mp), %r9, %r8)
- mov (up), %rbp
- mulx( 8,(mp), %r11, %r10)
- sar $2, i
- add %rbx, %rbp
- nop
- adc %rax, %r9
- jmp L(lo3)
-
- ALIGN(16)
-L(tp3): adc %rax, %r9
- lea 32(up), up
-L(lo3): adc %r8, %r11
- mulx( 16,(mp), %r13, %r12)
- mov 8(up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov 16(up), %r10
- mov 24(up), %r12
- add %r9, %r8
- mov 32(up), %rbp
- mov %r8, 8(up)
- adc %r11, %r10
- mulx( (mp), %r9, %r8)
- mov %r10, 16(up)
- adc %r13, %r12
- mov %r12, 24(up)
- adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 32(up)
- inc i
- jnz L(tp3)
-
-L(ed3): mov 64(up,n,8), %rdx C next iteration up[0]
- lea 24(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 40(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 48(up), %rax
- add %r9, %r8
- mov %r8, 40(up)
- adc %r11, %rax
- mov %rax, 48(up)
- lea 64(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)
- inc j
- jnz L(o3)
-
- jmp L(cj)
-
-L(o0b): lea 16(mp), mp
-L(o0): mov n, i C inner loop induction var
- mulx( -16,(mp), %r13, %r12)
- mulx( -8,(mp), %rbx, %rax)
- sar $2, i
- add %r12, %rbx
- adc $0, %rax
- mov (up), %r12
- mov 8(up), %rbp
- mulx( (mp), %r9, %r8)
- add %r13, %r12
- jmp L(lo0)
-
- ALIGN(16)
-L(tp0): adc %rax, %r9
- lea 32(up), up
- adc %r8, %r11
- mulx( 16,(mp), %r13, %r12)
- mov -16(up), %r8
- mulx( 24,(mp), %rbx, %rax)
- lea 32(mp), mp
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- mov -8(up), %r10
- mov (up), %r12
- add %r9, %r8
- mov 8(up), %rbp
- mov %r8, -16(up)
- adc %r11, %r10
- mulx( (mp), %r9, %r8)
- mov %r10, -8(up)
- adc %r13, %r12
- mov %r12, (up)
-L(lo0): adc %rbx, %rbp
- mulx( 8,(mp), %r11, %r10)
- mov %rbp, 8(up)
- inc i
- jnz L(tp0)
-
-L(ed0): mov 40(up,n,8), %rdx C next iteration up[0]
- lea 32(mp,n,8), mp C mp = (last starting mp)
- adc %rax, %r9
- adc %r8, %r11
- mov 16(up), %r8
- adc $0, %r10
- imul u0inv, %rdx C next iteration q0
- mov 24(up), %rax
- add %r9, %r8
- mov %r8, 16(up)
- adc %r11, %rax
- mov %rax, 24(up)
- lea 40(up,n,8), up C up = (last starting up) + 1
- adc $0, %r10
- mov %r10, -8(up)
- inc j
- jnz L(o0)
-
-L(cj):
-IFSTD(` mov 8(%rsp), %rdi C param 1: rp
- lea 16(%rsp), %rsp C deallocate two slots
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` mov up, %rdx C param 2: up
- lea (up,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov 8(%rsp), %rcx C param 1: rp
- lea 16(%rsp), %rsp ') C deallocate two slots
-
- CALL( mpn_add_n)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm b/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm
deleted file mode 100644
index 641cdf349a..0000000000
--- a/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm
+++ /dev/null
@@ -1,506 +0,0 @@
-dnl AMD64 mpn_sqr_basecase optimised for Intel Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
-C AMD K8,K9 n/a n/a n/a
-C AMD K10 n/a n/a n/a
-C AMD bull n/a n/a n/a
-C AMD pile n/a n/a n/a
-C AMD steam ? ? ?
-C AMD bobcat n/a n/a n/a
-C AMD jaguar ? ? ?
-C Intel P4 n/a n/a n/a
-C Intel core n/a n/a n/a
-C Intel NHM n/a n/a n/a
-C Intel SBR n/a n/a n/a
-C Intel IBR n/a n/a n/a
-C Intel HWL 1.86 2.15 ~2.5
-C Intel BWL ? ? ?
-C Intel atom n/a n/a n/a
-C VIA nano n/a n/a n/a
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
-C that the sqr_diag_addlsh1 loop was manually written.
-
-C TODO
-C * Replace current unoptimised sqr_diag_addlsh1 loop; 1.75 c/l might be
-C possible.
-C * Consider splitting outer loop into 2, one for n = 1 (mod 2) and one for
-C n = 0 (mod 2). These loops could fall into specific "corner" code.
-C * Consider splitting outer loop into 4.
-C * Streamline pointer updates.
-C * Perhaps suppress a few more xor insns in feed-in code.
-C * Make sure we write no dead registers in feed-in code.
-C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch
-C out for negative sizes being zero-extended, though.
-C * Provide straight-line code for n = 4; then look for simplifications in
-C main code.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
- FUNC_ENTRY(3)
-
- cmp $2, un_param
- jae L(gt1)
-
- mov (up), %rdx
- mulx( %rdx, %rax, %rdx)
- mov %rax, (rp)
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-
-L(gt1): jne L(gt2)
-
- mov (up), %rdx
- mov 8(up), %rcx
- mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
- mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
- mov %rcx, %rdx
- mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
- add %r9, %r9 C W 1
- adc %r10, %r10 C W 2
- adc $0, %rdx C W 3
- add %r9, %r8 C W 1
- adc %r11, %r10 C W 2
- adc $0, %rdx C W 3
- mov %rax, (rp)
- mov %r8, 8(rp)
- mov %r10, 16(rp)
- mov %rdx, 24(rp)
- FUNC_EXIT()
- ret
-
-L(gt2): cmp $4, un_param
- jae L(gt3)
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%r10')
-define(`w2', `%r11')
-
- mov (up), v0
- mov 8(up), %rdx
- mov %rdx, v1
- mulx( v0, w2, %rax)
- mov 16(up), %rdx
- mulx( v0, w0, %rcx)
- mov w2, %r8
- add %rax, w0
- adc $0, %rcx
- mulx( v1, %rdx, %rax)
- add %rcx, %rdx
- mov %rdx, 24(rp)
- adc $0, %rax
- mov %rax, 32(rp)
- xor R32(%rcx), R32(%rcx)
- mov (up), %rdx
- mulx( %rdx, %rax, w2)
- mov %rax, (rp)
- add %r8, %r8
- adc w0, w0
- setc R8(%rcx)
- mov 8(up), %rdx
- mulx( %rdx, %rax, %rdx)
- add w2, %r8
- adc %rax, w0
- mov %r8, 8(rp)
- mov w0, 16(rp)
- mov 24(rp), %r8
- mov 32(rp), w0
- lea (%rdx,%rcx), w2
- adc %r8, %r8
- adc w0, w0
- setc R8(%rcx)
- mov 16(up), %rdx
- mulx( %rdx, %rax, %rdx)
- add w2, %r8
- adc %rax, w0
- mov %r8, 24(rp)
- mov w0, 32(rp)
- adc %rcx, %rdx
- mov %rdx, 40(rp)
- FUNC_EXIT()
- ret
-
-L(gt3):
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%rbx')
-define(`w3', `%rbp')
-define(`un', `%r12')
-define(`n', `%rcx')
-
-define(`X0', `%r13')
-define(`X1', `%r14')
-
-L(do_mul_2):
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- mov $0, R32(un)
- sub un_param, un C free up rdx
- push un
- mov (up), v0
- mov 8(up), %rdx
- lea 2(un), n
- sar $2, n C FIXME: suppress, change loop?
- inc un C decrement |un|
- mov %rdx, v1
-
- test $1, R8(un)
- jnz L(mx1)
-
-L(mx0): mulx( v0, w2, w1)
- mov 16(up), %rdx
- mov w2, 8(rp)
- xor w2, w2
- mulx( v0, w0, w3)
- test $2, R8(un)
- jz L(m00)
-
-L(m10): lea -8(rp), rp
- lea -8(up), up
- jmp L(mlo2)
-
-L(m00): lea 8(up), up
- lea 8(rp), rp
- jmp L(mlo0)
-
-L(mx1): mulx( v0, w0, w3)
- mov 16(up), %rdx
- mov w0, 8(rp)
- xor w0, w0
- mulx( v0, w2, w1)
- test $2, R8(un)
- jz L(mlo3)
-
-L(m01): lea 16(rp), rp
- lea 16(up), up
- jmp L(mlo1)
-
- ALIGN(32)
-L(mtop):mulx( v1, %rax, w0)
- add %rax, w2 C 0
- mov (up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 1
- add %rax, w2 C 0
-L(mlo1):adc $0, w1 C 1
- add w3, w2 C 0
- mov w2, (rp) C 0
- adc $0, w1 C 1
- mulx( v1, %rax, w2)
- add %rax, w0 C 1
- mov 8(up), %rdx
- adc $0, w2 C 2
- mulx( v0, %rax, w3)
- add %rax, w0 C 1
- adc $0, w3 C 2
-L(mlo0):add w1, w0 C 1
- mov w0, 8(rp) C 1
- adc $0, w3 C 2
- mulx( v1, %rax, w0)
- add %rax, w2 C 2
- mov 16(up), %rdx
- mulx( v0, %rax, w1)
- adc $0, w0 C 3
- add %rax, w2 C 2
- adc $0, w1 C 3
-L(mlo3):add w3, w2 C 2
- mov w2, 16(rp) C 2
- adc $0, w1 C 3
- mulx( v1, %rax, w2)
- add %rax, w0 C 3
- mov 24(up), %rdx
- adc $0, w2 C 4
- mulx( v0, %rax, w3)
- add %rax, w0 C 3
- adc $0, w3 C 4
-L(mlo2):add w1, w0 C 3
- lea 32(up), up
- mov w0, 24(rp) C 3
- adc $0, w3 C 4
- inc n
- lea 32(rp), rp
- jnz L(mtop)
-
-L(mend):mulx( v1, %rdx, %rax)
- add %rdx, w2
- adc $0, %rax
- add w3, w2
- mov w2, (rp)
- adc $0, %rax
- mov %rax, 8(rp)
-
- lea 16(up), up
- lea -16(rp), rp
-
-L(do_addmul_2):
-L(outer):
- lea (up,un,8), up C put back up to 2 positions above last time
- lea 48(rp,un,8), rp C put back rp to 4 positions above last time
-
- mov -8(up), v0 C shared between addmul_2 and corner
-
- add $2, un C decrease |un|
- cmp $-2, un
- jge L(corner)
-
- mov (up), v1
-
- lea 1(un), n
- sar $2, n C FIXME: suppress, change loop?
-
- mov v1, %rdx
- test $1, R8(un)
- jnz L(bx1)
-
-L(bx0): mov (rp), X0
- mov 8(rp), X1
- mulx( v0, %rax, w1)
- add %rax, X0
- adc $0, w1
- mov X0, (rp)
- xor w2, w2
- test $2, R8(un)
- jnz L(b10)
-
-L(b00): mov 8(up), %rdx
- lea 16(rp), rp
- lea 16(up), up
- jmp L(lo0)
-
-L(b10): mov 8(up), %rdx
- mov 16(rp), X0
- lea 32(up), up
- inc n
- mulx( v0, %rax, w3)
- jz L(ex)
- jmp L(lo2)
-
-L(bx1): mov (rp), X1
- mov 8(rp), X0
- mulx( v0, %rax, w3)
- mov 8(up), %rdx
- add %rax, X1
- adc $0, w3
- xor w0, w0
- mov X1, (rp)
- mulx( v0, %rax, w1)
- test $2, R8(un)
- jz L(b11)
-
-L(b01): mov 16(rp), X1
- lea 24(rp), rp
- lea 24(up), up
- jmp L(lo1)
-
-L(b11): lea 8(rp), rp
- lea 8(up), up
- jmp L(lo3)
-
- ALIGN(32)
-L(top): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
-L(lo2): add %rax, X1
- adc $0, w3
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- lea 32(rp), rp
- add w1, X1
- mov -16(up), %rdx
- mov X1, -24(rp)
- adc $0, w3
- add w2, X0
- mov -8(rp), X1
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo1): add %rax, X0
- mulx( v1, %rax, w2)
- adc $0, w1
- add w3, X0
- mov X0, -16(rp)
- adc $0, w1
- add %rax, X1
- adc $0, w2
- add w0, X1
- mov -8(up), %rdx
- adc $0, w2
-L(lo0): mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mov (rp), X0
- mulx( v1, %rax, w0)
- add %rax, X0
- adc $0, w0
- add w1, X1
- mov X1, -8(rp)
- adc $0, w3
- mov (up), %rdx
- add w2, X0
- mulx( v0, %rax, w1)
- adc $0, w0
-L(lo3): add %rax, X0
- adc $0, w1
- mulx( v1, %rax, w2)
- add w3, X0
- mov 8(rp), X1
- mov X0, (rp)
- mov 16(rp), X0
- adc $0, w1
- add %rax, X1
- adc $0, w2
- mov 8(up), %rdx
- lea 32(up), up
- inc n
- jnz L(top)
-
-L(end): mulx( v0, %rax, w3)
- add w0, X1
- adc $0, w2
-L(ex): add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w2, %rdx
- adc $0, %rax
- add %rdx, w3
- mov w3, 16(rp)
- adc $0, %rax
- mov %rax, 24(rp)
-
- jmp L(outer) C loop until a small corner remains
-
-L(corner):
- pop un
- mov (up), %rdx
- jg L(small_corner)
-
- mov %rdx, v1
- mov (rp), X0
- mov %rax, X1 C Tricky rax reuse of last iteration
- mulx( v0, %rax, w1)
- add %rax, X0
- adc $0, w1
- mov X0, (rp)
- mov 8(up), %rdx
- mulx( v0, %rax, w3)
- add %rax, X1
- adc $0, w3
- mulx( v1, %rdx, %rax)
- add w1, X1
- mov X1, 8(rp)
- adc $0, w3
- add w3, %rdx
- mov %rdx, 16(rp)
- adc $0, %rax
- mov %rax, 24(rp)
- lea 32(rp), rp
- lea 16(up), up
- jmp L(com)
-
-L(small_corner):
- mulx( v0, X1, w3)
- add %rax, X1 C Tricky rax reuse of last iteration
- adc $0, w3
- mov X1, (rp)
- mov w3, 8(rp)
- lea 16(rp), rp
- lea 8(up), up
-
-L(com):
-
-L(sqr_diag_addlsh1):
- lea 8(up,un,8), up C put back up at its very beginning
- lea (rp,un,8), rp
- lea (rp,un,8), rp C put back rp at its very beginning
- inc un
-
- mov -8(up), %rdx
- xor R32(%rbx), R32(%rbx) C clear CF as side effect
- mulx( %rdx, %rax, %r10)
- mov %rax, 8(rp)
- mov 16(rp), %r8
- mov 24(rp), %r9
- jmp L(dm)
-
- ALIGN(16)
-L(dtop):mov 32(rp), %r8
- mov 40(rp), %r9
- lea 16(rp), rp
- lea (%rdx,%rbx), %r10
-L(dm): adc %r8, %r8
- adc %r9, %r9
- setc R8(%rbx)
- mov (up), %rdx
- lea 8(up), up
- mulx( %rdx, %rax, %rdx)
- add %r10, %r8
- adc %rax, %r9
- mov %r8, 16(rp)
- mov %r9, 24(rp)
- inc un
- jnz L(dtop)
-
-L(dend):adc %rbx, %rdx
- mov %rdx, 32(rp)
-
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreinhm/aorrlsh_n.asm b/gmp/mpn/x86_64/coreinhm/aorrlsh_n.asm
deleted file mode 100644
index eed64e701e..0000000000
--- a/gmp/mpn/x86_64/coreinhm/aorrlsh_n.asm
+++ /dev/null
@@ -1,200 +0,0 @@
-dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
-dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
-dnl Optimised for Nehalem.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 4.75
-C Intel P4 ?
-C Intel core2 2.8-3
-C Intel NHM 2.8
-C Intel SBR 3.55
-C Intel atom ?
-C VIA nano ?
-
-C The inner-loop probably runs close to optimally on Nehalem (using 4-way
-C unrolling). The rest of the code is quite crude, and could perhaps be made
-C both smaller and faster.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cnt', `%r8')
-define(`cy', `%r9') C for _nc variant
-
-ifdef(`OPERATION_addlsh_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(IFRSB, )
- define(func_n, mpn_addlsh_n)
- define(func_nc, mpn_addlsh_nc)')
-ifdef(`OPERATION_rsblsh_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(IFRSB, `$1')
- define(func_n, mpn_rsblsh_n)
- define(func_nc, mpn_rsblsh_nc)')
-
-C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
-C refmpn_rsblsh_nc
-MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ') C cnt
- push %rbx
- xor R32(%rbx), R32(%rbx) C clear CF save register
-L(ent): push %rbp
- mov R32(n), R32(%rbp)
- mov n, %rax
-
- mov R32(cnt), R32(%rcx)
- neg R32(%rcx)
-
- lea -8(up,%rax,8), up
- lea -8(vp,%rax,8), vp
- lea -40(rp,%rax,8), rp
- neg %rax
-
- and $3, R32(%rbp)
- jz L(b0)
- cmp $2, R32(%rbp)
- jc L(b1)
- jz L(b2)
-
-L(b3): xor R32(%r9), R32(%r9)
- mov 8(vp,%rax,8), %r10
- mov 16(vp,%rax,8), %r11
- shrd %cl, %r10, %r9
- shrd %cl, %r11, %r10
- add R32(%rbx), R32(%rbx)
- ADCSBB 8(up,%rax,8), %r9
- mov 24(vp,%rax,8), %r8
- ADCSBB 16(up,%rax,8), %r10
- sbb R32(%rbx), R32(%rbx)
- add $3, %rax
- jmp L(lo3)
-
-L(b0): mov 8(vp,%rax,8), %r9
- xor R32(%r8), R32(%r8)
- shrd %cl, %r9, %r8
- mov 16(vp,%rax,8), %r10
- mov 24(vp,%rax,8), %r11
- shrd %cl, %r10, %r9
- shrd %cl, %r11, %r10
- add R32(%rbx), R32(%rbx)
- ADCSBB 8(up,%rax,8), %r8
- mov %r8, 40(rp,%rax,8) C offset 40
- ADCSBB 16(up,%rax,8), %r9
- mov 32(vp,%rax,8), %r8
- ADCSBB 24(up,%rax,8), %r10
- sbb R32(%rbx), R32(%rbx)
- add $4, %rax
- jmp L(lo0)
-
-L(b1): mov 8(vp,%rax,8), %r8
- add $1, %rax
- jz L(1)
- mov 8(vp,%rax,8), %r9
- xor R32(%rbp), R32(%rbp)
- jmp L(lo1)
-L(1): xor R32(%r11), R32(%r11)
- jmp L(wd1)
-
-L(b2): xor %r10, %r10
- mov 8(vp,%rax,8), %r11
- shrd %cl, %r11, %r10
- add R32(%rbx), R32(%rbx)
- mov 16(vp,%rax,8), %r8
- ADCSBB 8(up,%rax,8), %r10
- sbb R32(%rbx), R32(%rbx)
- add $2, %rax
- jz L(end)
-
- ALIGN(16)
-L(top): mov 8(vp,%rax,8), %r9
- mov %r11, %rbp
-L(lo2): mov %r10, 24(rp,%rax,8) C offset 24
-L(lo1): shrd %cl, %r8, %rbp
- shrd %cl, %r9, %r8
- mov 16(vp,%rax,8), %r10
- mov 24(vp,%rax,8), %r11
- shrd %cl, %r10, %r9
- shrd %cl, %r11, %r10
- add R32(%rbx), R32(%rbx)
- ADCSBB (up,%rax,8), %rbp
- ADCSBB 8(up,%rax,8), %r8
- mov %r8, 40(rp,%rax,8) C offset 40
- ADCSBB 16(up,%rax,8), %r9
- mov 32(vp,%rax,8), %r8
- ADCSBB 24(up,%rax,8), %r10
- sbb R32(%rbx), R32(%rbx)
- add $4, %rax
- mov %rbp, (rp,%rax,8) C offset 32
-L(lo0):
-L(lo3): mov %r9, 16(rp,%rax,8) C offset 48
- jnz L(top)
-
-L(end): mov %r10, 24(rp,%rax,8)
-L(wd1): shrd %cl, %r8, %r11
- add R32(%rbx), R32(%rbx)
- ADCSBB (up,%rax,8), %r11
- mov %r11, 32(rp,%rax,8) C offset 32
- adc R32(%rax), R32(%rax) C rax is zero after loop
- shr R8(%rcx), %r8
- ADDSUB %r8, %rax
-IFRSB( neg %rax)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ') C cnt
-IFDOS(` mov 64(%rsp), %r9 ') C cy
- push %rbx
- neg cy
- sbb R32(%rbx), R32(%rbx) C initialise CF save register
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreinhm/aorsmul_1.asm b/gmp/mpn/x86_64/coreinhm/aorsmul_1.asm
deleted file mode 100644
index b768905b93..0000000000
--- a/gmp/mpn/x86_64/coreinhm/aorsmul_1.asm
+++ /dev/null
@@ -1,187 +0,0 @@
-dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM 4.55 with minor fluctuations
-C Intel SBR
-C Intel IBR
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimization tool suite written by David Harvey and Torbjorn Granlund.
-
-C N.B.: Be careful if editing, making sure the loop alignment padding does not
-C become large, as we currently fall into it.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0', `%rcx') C r9
-
-define(`n', `%rbx')
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`func', `mpn_submul_1')
-')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(func)
- FUNC_ENTRY(4)
- push %rbx
-
- mov (up), %rax
- lea -8(up,n_param,8), up
- mov (rp), %r8
- lea -8(rp,n_param,8), rp
-
- test $1, R8(n_param)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n_param)
- jnz L(b10)
-
-L(b00): mov $3, R32(n)
- sub n_param, n
- mul v0
- mov $0, R32(%r11)
- mov %r8, %r10
- ADDSUB %rax, %r10
- mov -8(up,n,8), %rax
- adc %rdx, %r11
- jmp L(lo0)
-
-L(b10): mov $1, R32(n)
- sub n_param, n
- mul v0
- mov %r8, %r10
- mov $0, R32(%r11)
- ADDSUB %rax, %r10
- mov 8(up,n,8), %rax
- adc %rdx, %r11
- jmp L(lo2)
-
-L(bx1): test $2, R8(n_param)
- jz L(b01)
-
-L(b11): mov $2, R32(n)
- sub n_param, n
- mul v0
- ADDSUB %rax, %r8
- mov $0, R32(%r9)
- mov (up,n,8), %rax
- adc %rdx, %r9
- jmp L(lo3)
-
-L(b01): mov $0, R32(n)
- sub n_param, n
- xor %r11, %r11
- add $4, n
- jc L(end)
-
- ALIGN(32)
-L(top): mul v0
- ADDSUB %rax, %r8
- mov $0, R32(%r9)
- mov -16(up,n,8), %rax
- adc %rdx, %r9
-L(lo1): mul v0
- ADDSUB %r11, %r8
- mov $0, R32(%r11)
- mov -16(rp,n,8), %r10
- adc $0, %r9
- ADDSUB %rax, %r10
- mov -8(up,n,8), %rax
- adc %rdx, %r11
- mov %r8, -24(rp,n,8)
- ADDSUB %r9, %r10
- adc $0, %r11
-L(lo0): mov -8(rp,n,8), %r8
- mul v0
- ADDSUB %rax, %r8
- mov $0, R32(%r9)
- mov (up,n,8), %rax
- adc %rdx, %r9
- mov %r10, -16(rp,n,8)
- ADDSUB %r11, %r8
- adc $0, %r9
-L(lo3): mul v0
- mov (rp,n,8), %r10
- mov $0, R32(%r11)
- ADDSUB %rax, %r10
- mov 8(up,n,8), %rax
- adc %rdx, %r11
- mov %r8, -8(rp,n,8)
- ADDSUB %r9, %r10
- adc $0, %r11
-L(lo2): mov 8(rp,n,8), %r8
- mov %r10, (rp,n,8)
- add $4, n
- jnc L(top)
-
-L(end): mul v0
- ADDSUB %rax, %r8
- mov $0, R32(%rax)
- adc %rdx, %rax
- ADDSUB %r11, %r8
- adc $0, %rax
- mov %r8, (rp)
-
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/coreinhm/gmp-mparam.h b/gmp/mpn/x86_64/coreinhm/gmp-mparam.h
deleted file mode 100644
index 6a7c03639f..0000000000
--- a/gmp/mpn/x86_64/coreinhm/gmp-mparam.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/* Nehalem gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2012, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 2667 MHz Core i7 Nehalem */
-/* FFT tuning limit = 100000000 */
-/* Generated by tuneup.c, 2014-03-18, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 16
-
-#define MUL_TOOM22_THRESHOLD 18
-#define MUL_TOOM33_THRESHOLD 60
-#define MUL_TOOM44_THRESHOLD 166
-#define MUL_TOOM6H_THRESHOLD 228
-#define MUL_TOOM8H_THRESHOLD 309
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 63
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 104
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 147
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 93
-#define SQR_TOOM4_THRESHOLD 250
-#define SQR_TOOM6_THRESHOLD 351
-#define SQR_TOOM8_THRESHOLD 454
-
-#define MULMID_TOOM42_THRESHOLD 28
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 15
-
-#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
- { 159, 9}, { 319,10}, { 167,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319,12}, { 95,11}, \
- { 191,10}, { 383, 9}, { 767,11}, { 207,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
- { 303,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
- { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 671,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,12}, { 447,11}, { 895,12}, { 479,14}, \
- { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
- { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \
- { 607,13}, { 319,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
- { 575,12}, { 1215,11}, { 2431,13}, { 639,12}, \
- { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \
- { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \
- { 959,14}, { 511,13}, { 1087,12}, { 2175,13}, \
- { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \
- { 2687,13}, { 1407,12}, { 2815,14}, { 767,13}, \
- { 1663,14}, { 895,13}, { 1919,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
- { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
- { 2815,15}, { 767,14}, { 1535,13}, { 3071,14}, \
- { 1663,13}, { 3455,14}, { 1791,13}, { 3583,14}, \
- { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \
- { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
- { 1535,14}, { 3455,15}, { 1791,14}, { 3839,16}, \
- { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
- { 4863,15}, { 2815,14}, { 5887,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 203
-#define MUL_FFT_THRESHOLD 4032
-
-#define SQR_FFT_MODF_THRESHOLD 312 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 312, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
- { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 135,11}, { 79,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319, 9}, { 639,12}, \
- { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \
- { 207,13}, { 63,12}, { 127,11}, { 255,10}, \
- { 511, 9}, { 1023,11}, { 271,10}, { 543,11}, \
- { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
- { 319,10}, { 639, 9}, { 1279,11}, { 351,10}, \
- { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,10}, { 831,12}, { 223,11}, { 447,10}, \
- { 895,11}, { 479,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
- { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
- { 895,12}, { 479,11}, { 959,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
- { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \
- { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
- { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \
- { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
- { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
- { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \
- { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \
- { 2431,14}, { 639,13}, { 1279,12}, { 2559,13}, \
- { 1343,12}, { 2687,13}, { 1407,12}, { 2815,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1791,12}, \
- { 3583,13}, { 1919,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
- { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
- { 767,14}, { 1663,13}, { 3455,14}, { 1791,13}, \
- { 3583,16}, { 511,15}, { 1023,14}, { 2303,13}, \
- { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
- { 2943,13}, { 5887,15}, { 1535,14}, { 3455,15}, \
- { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \
- { 4223,15}, { 2303,14}, { 4863,15}, { 2815,14}, \
- { 5887,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 217
-#define SQR_FFT_THRESHOLD 2752
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 45
-#define MULLO_MUL_N_THRESHOLD 8397
-
-#define DC_DIV_QR_THRESHOLD 46
-#define DC_DIVAPPR_Q_THRESHOLD 135
-#define DC_BDIV_QR_THRESHOLD 38
-#define DC_BDIV_Q_THRESHOLD 31
-
-#define INV_MULMOD_BNM1_THRESHOLD 34
-#define INV_NEWTON_THRESHOLD 212
-#define INV_APPR_THRESHOLD 155
-
-#define BINV_NEWTON_THRESHOLD 254
-#define REDC_1_TO_REDC_2_THRESHOLD 32
-#define REDC_2_TO_REDC_N_THRESHOLD 50
-
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1360
-#define MUPI_DIV_QR_THRESHOLD 85
-#define MU_BDIV_QR_THRESHOLD 1142
-#define MU_BDIV_Q_THRESHOLD 1210
-
-#define POWM_SEC_TABLE 3,46,194,494,1678
-
-#define MATRIX22_STRASSEN_THRESHOLD 21
-#define HGCD_THRESHOLD 141
-#define HGCD_APPR_THRESHOLD 175
-#define HGCD_REDUCE_THRESHOLD 2205
-#define GCD_DC_THRESHOLD 330
-#define GCDEXT_DC_THRESHOLD 361
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 15
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 517
-#define SET_STR_PRECOMPUTE_THRESHOLD 1430
-
-#define FAC_DSC_THRESHOLD 351
-#define FAC_ODD_THRESHOLD 43
diff --git a/gmp/mpn/x86_64/coreinhm/hamdist.asm b/gmp/mpn/x86_64/coreinhm/hamdist.asm
deleted file mode 100644
index 93e1e5632b..0000000000
--- a/gmp/mpn/x86_64/coreinhm/hamdist.asm
+++ /dev/null
@@ -1,38 +0,0 @@
-dnl AMD64 mpn_hamdist -- hamming distance.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_hamdist)
-include_mpn(`x86_64/k10/hamdist.asm')
diff --git a/gmp/mpn/x86_64/coreinhm/popcount.asm b/gmp/mpn/x86_64/coreinhm/popcount.asm
deleted file mode 100644
index 8f22a715b6..0000000000
--- a/gmp/mpn/x86_64/coreinhm/popcount.asm
+++ /dev/null
@@ -1,38 +0,0 @@
-dnl AMD64 mpn_popcount -- population count.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86_64/k10/popcount.asm')
diff --git a/gmp/mpn/x86_64/coreinhm/redc_1.asm b/gmp/mpn/x86_64/coreinhm/redc_1.asm
deleted file mode 100644
index 4d9261d8f9..0000000000
--- a/gmp/mpn/x86_64/coreinhm/redc_1.asm
+++ /dev/null
@@ -1,544 +0,0 @@
-dnl X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bull ?
-C AMD pile ?
-C AMD steam ?
-C AMD bobcat ?
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom ?
-C VIA nano ?
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Micro-optimise, none performed thus far.
-C * Consider inlining mpn_add_n.
-C * Single basecases out before the pushes.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%r12')
-define(`q0', `%r13')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_redc_1) C in: rp, up, mp_param, n, u0inv (register aliases defined above)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ') C DOS64 only: fetch the fifth argument (u0inv) from the stack
- push %rbx C save the six registers that L(ret) restores
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (up), q0 C q0 = up[0]
- mov n, j C outer loop induction var
- lea (mp_param,n,8), mp C mp = mp_param + 8*n
- lea (up,n,8), up C up = up + 8*n
- neg n C n = -n, so (mp,n,8) and (up,n,8) address limb 0 again
- imul u0inv, q0 C first iteration q0
-
- test $1, R8(n) C dispatch on the two low bits of the negated n
- jz L(bx0)
-
-L(bx1): test $2, R8(n)
- jz L(b3)
-
-L(b1): cmp $-1, R32(n) C negated n == -1 means a single limb
- jz L(n1)
-
-L(otp1):lea 3(n), i C outer loop head; i is the negative inner loop index
- mov (mp,n,8), %rax
- mov (up,n,8), %rbp
- mul q0
- add %rax, %rbp
- mov $0, R32(%r9)
- mov 8(mp,n,8), %rax
- adc %rdx, %r9
- mul q0
- mov $0, R32(%r11)
- mov 8(up,n,8), %rbx
- add %rax, %rbx
- mov 16(mp,n,8), %rax
- adc %rdx, %r11
- add %r9, %rbx
- adc $0, %r11
- mov 16(up,n,8), %rbp
- mul q0
- add %rax, %rbp
- mov $0, R32(%r9)
- mov 24(mp,n,8), %rax
- adc %rdx, %r9
- mov %rbx, 8(up,n,8) C store updated limb 1 before rbx is turned into the next q
- imul u0inv, %rbx C next q limb
- jmp L(e1) C enter the unrolled loop in the middle
-
- ALIGNx
-L(tp1): mul q0 C inner loop: four multiplies by q0 per iteration
- add %rax, %rbp
- mov $0, R32(%r9)
- mov -16(mp,i,8), %rax
- adc %rdx, %r9
- mul q0
- add %r11, %rbp
- mov $0, R32(%r11)
- mov -16(up,i,8), %r10
- adc $0, %r9
- add %rax, %r10
- mov -8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -24(up,i,8)
- add %r9, %r10
- adc $0, %r11
- mov -8(up,i,8), %rbp
- mul q0
- add %rax, %rbp
- mov $0, R32(%r9)
- mov (mp,i,8), %rax
- adc %rdx, %r9
- mov %r10, -16(up,i,8)
-L(e1): add %r11, %rbp C loop entry point used by L(otp1)
- adc $0, %r9
- mul q0
- mov (up,i,8), %r10
- mov $0, R32(%r11)
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -8(up,i,8)
- add %r9, %r10
- adc $0, %r11
- mov 8(up,i,8), %rbp
- mov %r10, (up,i,8)
- add $4, i
- jnc L(tp1) C loop until the add to the negative index carries out
-
-L(ed1): mul q0 C wind-down: last product plus pending carries
- add %rax, %rbp
- adc $0, %rdx
- add %r11, %rbp
- adc $0, %rdx
- mov %rbp, I(-8(up),-24(up,i,8))
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp1) C j started at n, so the outer loop runs n times
- jmp L(cj)
-
-L(b3): cmp $-3, R32(n) C negated n == -3 means exactly three limbs
- jz L(n3)
-
-L(otp3):lea 5(n), i C outer loop head; i is the negative inner loop index
- mov (mp,n,8), %rax
- mov (up,n,8), %rbp
- mul q0
- add %rax, %rbp
- mov $0, R32(%r9)
- mov 8(mp,n,8), %rax
- adc %rdx, %r9
- mul q0
- mov 8(up,n,8), %rbx
- mov $0, R32(%r11)
- add %rax, %rbx
- mov 16(mp,n,8), %rax
- adc %rdx, %r11
- add %r9, %rbx
- adc $0, %r11
- mov 16(up,n,8), %rbp
- mov %rbx, 8(up,n,8) C store updated limb 1 before rbx is turned into the next q
- imul u0inv, %rbx C next q limb
-C jmp L(tp3)
-
- ALIGNx
-L(tp3): mul q0 C inner loop: four multiplies by q0 per iteration
- add %rax, %rbp
- mov $0, R32(%r9)
- mov -16(mp,i,8), %rax
- adc %rdx, %r9
- mul q0
- add %r11, %rbp
- mov $0, R32(%r11)
- mov -16(up,i,8), %r10
- adc $0, %r9
- add %rax, %r10
- mov -8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -24(up,i,8)
- add %r9, %r10
- adc $0, %r11
- mov -8(up,i,8), %rbp
- mul q0
- add %rax, %rbp
- mov $0, R32(%r9)
- mov (mp,i,8), %rax
- adc %rdx, %r9
- mov %r10, -16(up,i,8)
- add %r11, %rbp
- adc $0, %r9
- mul q0
- mov (up,i,8), %r10
- mov $0, R32(%r11)
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -8(up,i,8)
- add %r9, %r10
- adc $0, %r11
- mov 8(up,i,8), %rbp
- mov %r10, (up,i,8)
- add $4, i
- jnc L(tp3) C loop until the add to the negative index carries out
-
-L(ed3): mul q0 C wind-down: last product plus pending carries
- add %rax, %rbp
- adc $0, %rdx
- add %r11, %rbp
- adc $0, %rdx
- mov %rbp, I(-8(up),-24(up,i,8))
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp3)
-C jmp L(cj)
-
-L(cj): C join point of the four loop variants: set up arguments and call mpn_add_n
-IFSTD(` lea (up,n,8), up C param 2: up
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` lea (up,n,8), %rdx C param 2: up
- lea (%rdx,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov rp, %rcx ') C param 1: rp
-
- CALL( mpn_add_n)
-
-L(ret): pop %r15 C common exit: restore registers in reverse push order
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(bx0): test $2, R8(n)
- jnz L(b2)
-
-L(b0):
-L(otp0):lea 2(n), i C outer loop head; i is the negative inner loop index
- mov (mp,n,8), %rax
- mul q0
- mov $0, R32(%r11)
- mov (up,n,8), %r10
- add %rax, %r10
- mov 8(mp,n,8), %rax
- adc %rdx, %r11
- mov 8(up,n,8), %rbx
- mul q0
- add %rax, %rbx
- mov $0, R32(%r9)
- mov 16(mp,n,8), %rax
- adc %rdx, %r9
- add %r11, %rbx
- adc $0, %r9
- mul q0
- mov 16(up,n,8), %r10
- mov $0, R32(%r11)
- add %rax, %r10
- mov 24(mp,n,8), %rax
- adc %rdx, %r11
- mov %rbx, 8(up,n,8) C store updated limb 1 before rbx is turned into the next q
- imul u0inv, %rbx C next q limb
- jmp L(e0) C enter the unrolled loop in the middle
-
- ALIGNx
-L(tp0): mul q0 C inner loop: four multiplies by q0 per iteration
- add %rax, %rbp
- mov $0, R32(%r9)
- mov -16(mp,i,8), %rax
- adc %rdx, %r9
- mul q0
- add %r11, %rbp
- mov $0, R32(%r11)
- mov -16(up,i,8), %r10
- adc $0, %r9
- add %rax, %r10
- mov -8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -24(up,i,8)
- add %r9, %r10
- adc $0, %r11
- mov -8(up,i,8), %rbp
- mul q0
- add %rax, %rbp
- mov $0, R32(%r9)
- mov (mp,i,8), %rax
- adc %rdx, %r9
- mov %r10, -16(up,i,8)
- add %r11, %rbp
- adc $0, %r9
- mul q0
- mov (up,i,8), %r10
- mov $0, R32(%r11)
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -8(up,i,8)
-L(e0): add %r9, %r10 C loop entry point used by L(otp0)
- adc $0, %r11
- mov 8(up,i,8), %rbp
- mov %r10, (up,i,8)
- add $4, i
- jnc L(tp0) C loop until the add to the negative index carries out
-
-L(ed0): mul q0 C wind-down: last product plus pending carries
- add %rax, %rbp
- adc $0, %rdx
- add %r11, %rbp
- adc $0, %rdx
- mov %rbp, I(-8(up),-24(up,i,8))
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp0)
- jmp L(cj)
-
-L(b2): cmp $-2, R32(n) C negated n == -2 means exactly two limbs
- jz L(n2)
-
-L(otp2):lea 4(n), i C outer loop head; i is the negative inner loop index
- mov (mp,n,8), %rax
- mul q0
- mov (up,n,8), %r10
- mov $0, R32(%r11)
- add %rax, %r10
- mov 8(mp,n,8), %rax
- adc %rdx, %r11
- mov 8(up,n,8), %rbx
- mul q0
- add %rax, %rbx
- mov $0, R32(%r9)
- mov 16(mp,n,8), %rax
- adc %rdx, %r9
- mul q0
- add %r11, %rbx
- mov $0, R32(%r11)
- mov 16(up,n,8), %r10
- adc $0, %r9
- add %rax, %r10
- mov 24(mp,n,8), %rax
- adc %rdx, %r11
- mov %rbx, 8(up,n,8) C store updated limb 1 before rbx is turned into the next q
- imul u0inv, %rbx C next q limb
- jmp L(e2) C enter the unrolled loop in the middle
-
- ALIGNx
-L(tp2): mul q0 C inner loop: four multiplies by q0 per iteration
- add %rax, %rbp
- mov $0, R32(%r9)
- mov -16(mp,i,8), %rax
- adc %rdx, %r9
- mul q0
- add %r11, %rbp
- mov $0, R32(%r11)
- mov -16(up,i,8), %r10
- adc $0, %r9
- add %rax, %r10
- mov -8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -24(up,i,8)
-L(e2): add %r9, %r10 C loop entry point used by L(otp2)
- adc $0, %r11
- mov -8(up,i,8), %rbp
- mul q0
- add %rax, %rbp
- mov $0, R32(%r9)
- mov (mp,i,8), %rax
- adc %rdx, %r9
- mov %r10, -16(up,i,8)
- add %r11, %rbp
- adc $0, %r9
- mul q0
- mov (up,i,8), %r10
- mov $0, R32(%r11)
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc %rdx, %r11
- mov %rbp, -8(up,i,8)
- add %r9, %r10
- adc $0, %r11
- mov 8(up,i,8), %rbp
- mov %r10, (up,i,8)
- add $4, i
- jnc L(tp2) C loop until the add to the negative index carries out
-
-L(ed2): mul q0 C wind-down: last product plus pending carries
- add %rax, %rbp
- adc $0, %rdx
- add %r11, %rbp
- adc $0, %rdx
- mov %rbp, I(-8(up),-24(up,i,8))
- mov %rdx, (up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp2)
- jmp L(cj)
-
-L(n1): mov (mp_param), %rax C n == 1: straight-line code, no mpn_add_n call
- mul q0
- add -8(up), %rax C up was advanced one limb, so -8(up) is the original up[0]; only the carry is used
- adc (up), %rdx C high product limb + original up[1] + carry
- mov %rdx, (rp)
- mov $0, R32(%rax) C mov rather than xor, so the carry flag survives
- adc R32(%rax), R32(%rax) C rax = carry out
- jmp L(ret)
-
-L(n2): mov (mp_param), %rax C n == 2: straight-line code, no mpn_add_n call
- mov -16(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov -8(up), %r10
- mul q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- add %r9, %r10
- adc $0, %r11
- mov %r10, q0
- imul u0inv, q0 C next q0
- mov -16(mp), %rax
- mul q0
- add %rax, %r10
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- mov (up), %r14
- mul q0
- add %rax, %r14
- adc $0, %rdx
- add %r9, %r14
- adc $0, %rdx
- xor R32(%rax), R32(%rax) C zero rax before the final adds so it can collect the carry
- add %r11, %r14
- adc 8(up), %rdx
- mov %r14, (rp)
- mov %rdx, 8(rp)
- adc R32(%rax), R32(%rax) C rax = carry out (the two mov stores leave flags untouched)
- jmp L(ret)
-
- ALIGNx
-L(n3): mov -24(mp), %rax C n == 3: dedicated loop, executed j times
- mov -24(up), %r10
- mul q0
- add %rax, %r10
- mov -16(mp), %rax
- mov %rdx, %r11
- adc $0, %r11
- mov -16(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -8(mp), %rax
- add %r11, %rbp
- mov -8(up), %r10
- adc $0, %r9
- mul q0
- mov %rbp, q0
- imul u0inv, q0 C next q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- mov %rbp, -16(up)
- add %r9, %r10
- adc $0, %r11
- mov %r10, -8(up)
- mov %r11, -24(up) C up[0]
- lea 8(up), up C up++
- dec j
- jnz L(n3)
-
- mov -48(up), %rdx C final three-limb addition done inline instead of via mpn_add_n
- mov -40(up), %rbx
- xor R32(%rax), R32(%rax)
- add %rbp, %rdx
- adc %r10, %rbx
- adc -8(up), %r11
- mov %rdx, (rp)
- mov %rbx, 8(rp)
- mov %r11, 16(rp)
- adc R32(%rax), R32(%rax) C rax = carry out
- jmp L(ret)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/coreinhm/sec_tabselect.asm b/gmp/mpn/x86_64/coreinhm/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/coreinhm/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_sec_tabselect.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/addmul_2.asm b/gmp/mpn/x86_64/coreisbr/addmul_2.asm
deleted file mode 100644
index 21f0bf465f..0000000000
--- a/gmp/mpn/x86_64/coreisbr/addmul_2.asm
+++ /dev/null
@@ -1,224 +0,0 @@
-dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR 2.93 this
-C Intel IBR 2.66 this
-C Intel HWL 2.5 2.15
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C This code is the result of running a code generation and optimisation tool
-C suite written by David Harvey and Torbjorn Granlund.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`vp', `%rcx') C r9
-
-define(`n', `%rcx')
-define(`v0', `%rbx')
-define(`v1', `%rbp')
-define(`w0', `%r8')
-define(`w1', `%r9')
-define(`w2', `%r10')
-define(`w3', `%r11')
-define(`X0', `%r12')
-define(`X1', `%r13')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_addmul_2) C in: rp, up, n_param, vp (register aliases defined above); result limb returned in rax
- FUNC_ENTRY(4)
- push %rbx C save the registers used for v0, v1, X0 and X1
- push %rbp
- push %r12
- push %r13
-
- mov (vp), v0 C v0 = vp[0]
- mov 8(vp), v1 C v1 = vp[1]
-
- mov (up), %rax C rax = up[0]
-
- mov n_param, n C n shares %rcx with vp, which has been fully read by now
- neg n C n = -n_param, a negative index that counts up
-
- lea (up,n_param,8), up C up = up + 8*n_param
- lea 8(rp,n_param,8), rp C rp = rp + 8*n_param + 8
- mul v0 C rdx:rax = up[0] * v0
-
- test $1, R8(n) C dispatch on the two low bits of the negated count
- jnz L(bx1)
-
-L(bx0): mov -8(rp,n,8), X0
- mov %rdx, w1
- add %rax, X0
- adc $0, w1
- mov (up,n,8), %rax
- xor w0, w0
- xor w3, w3
- test $2, R8(n)
- jnz L(b10)
-
-L(b00): nop C this nop makes the loop go faster on SBR!
- mul v1
- mov (rp,n,8), X1
- jmp L(lo0)
-
-L(b10): lea -2(n), n C bias n for the L(lo2) entry point
- jmp L(lo2)
-
-L(bx1): mov -8(rp,n,8), X1
- mov %rdx, w3
- add %rax, X1
- adc $0, w3
- mov (up,n,8), %rax
- xor w1, w1
- xor w2, w2
- test $2, R8(n)
- jz L(b11)
-
-L(b01): mov (rp,n,8), X0
- inc n C bias n for the L(lo1) entry point
- jmp L(lo1)
-
-L(b11): dec n C bias n for the L(lo3) entry point
- jmp L(lo3)
-
- ALIGN(32)
-L(top): C main loop, n advances by 4 per iteration; L(lo0)..L(lo3) are its entry points
-L(lo1): mul v1
- mov %rdx, w0 C 1
- add %rax, X0 C 0
- adc $0, w0 C 1
- add w1, X1 C 3
- adc $0, w3 C 0
- add w2, X0 C 0
- adc $0, w0 C 1
- mov (up,n,8), %rax
- mul v0
- add %rax, X0 C 0
- mov %rdx, w1 C 1
- adc $0, w1 C 1
- mov (up,n,8), %rax
- mul v1
- mov X1, -16(rp,n,8) C 3
- mov (rp,n,8), X1 C 1
- add w3, X0 C 0
- adc $0, w1 C 1
-L(lo0): mov %rdx, w2 C 2
- mov X0, -8(rp,n,8) C 0
- add %rax, X1 C 1
- adc $0, w2 C 2
- mov 8(up,n,8), %rax
- add w0, X1 C 1
- adc $0, w2 C 2
- mul v0
- add %rax, X1 C 1
- mov %rdx, w3 C 2
- adc $0, w3 C 2
- mov 8(up,n,8), %rax
-L(lo3): mul v1
- add w1, X1 C 1
- mov 8(rp,n,8), X0 C 2
- adc $0, w3 C 2
- mov %rdx, w0 C 3
- add %rax, X0 C 2
- adc $0, w0 C 3
- mov 16(up,n,8), %rax
- mul v0
- add w2, X0 C 2
- mov X1, (rp,n,8) C 1
- mov %rdx, w1 C 3
- adc $0, w0 C 3
- add %rax, X0 C 2
- adc $0, w1 C 3
- mov 16(up,n,8), %rax
- add w3, X0 C 2
- adc $0, w1 C 3
-L(lo2): mul v1
- mov 16(rp,n,8), X1 C 3
- add %rax, X1 C 3
- mov %rdx, w2 C 4
- adc $0, w2 C 4
- mov 24(up,n,8), %rax
- mov X0, 8(rp,n,8) C 2
- mul v0
- add w0, X1 C 3
- mov %rdx, w3 C 4
- adc $0, w2 C 4
- add %rax, X1 C 3
- mov 24(up,n,8), %rax
- mov 24(rp,n,8), X0 C 0 useless but harmless final read
- adc $0, w3 C 4
- add $4, n
- jnc L(top) C loop until the add to the negative index carries out
-
-L(end): mul v1 C wind-down: last product by v1 plus pending carries
- add w1, X1
- adc $0, w3
- add w2, %rax
- adc $0, %rdx
- mov X1, I(-16(rp),-16(rp,n,8))
- add w3, %rax
- adc $0, %rdx
- mov %rax, I(-8(rp),-8(rp,n,8))
- mov %rdx, %rax C the limb left in rdx becomes the return value
-
- pop %r13 C restore registers in reverse push order
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/aorrlsh1_n.asm b/gmp/mpn/x86_64/coreisbr/aorrlsh1_n.asm
deleted file mode 100644
index 2319a80060..0000000000
--- a/gmp/mpn/x86_64/coreisbr/aorrlsh1_n.asm
+++ /dev/null
@@ -1,54 +0,0 @@
-dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
-dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 1) C shift count for this instance of the shared aorrlshC_n code
-define(RSH, 63) C 64 - LSH; the shared code uses it as its shrd/shr count
-
-ifdef(`OPERATION_addlsh1_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_addlsh1_n)
- define(func_nc, mpn_addlsh1_nc)')
-ifdef(`OPERATION_rsblsh1_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsblsh1_n)
- define(func_nc, mpn_rsblsh1_nc)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
-include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/aorrlsh2_n.asm b/gmp/mpn/x86_64/coreisbr/aorrlsh2_n.asm
deleted file mode 100644
index 9416d5a164..0000000000
--- a/gmp/mpn/x86_64/coreisbr/aorrlsh2_n.asm
+++ /dev/null
@@ -1,56 +0,0 @@
-dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
-dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2) C shift count for this instance of the shared aorrlshC_n code
-define(RSH, 62) C 64 - LSH; the shared code uses it as its shrd/shr count
-
-ifdef(`OPERATION_addlsh2_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_addlsh2_n)
- define(func_nc, mpn_addlsh2_nc)')
-ifdef(`OPERATION_rsblsh2_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsblsh2_n)
- define(func_nc, mpn_rsblsh2_nc)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-C mpn_rsblsh2_nc removed below, its idea of carry-in is inconsistent with
-C refmpn_rsblsh2_nc
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n)
-include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/aorrlshC_n.asm b/gmp/mpn/x86_64/coreisbr/aorrlshC_n.asm
deleted file mode 100644
index 23ace41889..0000000000
--- a/gmp/mpn/x86_64/coreisbr/aorrlshC_n.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
-dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
-
-dnl Copyright 2009-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 3.25
-C Intel NHM 4
-C Intel SBR 2 C (or 1.95 when L(top)'s alignment = 16 (mod 32))
-C Intel atom ?
-C VIA nano ?
-
-C This code probably runs close to optimally on Sandy Bridge (using 4-way
-C unrolling). It also runs reasonably well on Core 2, but it runs poorly on
-C all other processors, including Nehalem.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cy', `%r8')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func_nc) C carry-in entry: seeds rax from cy, then joins the code of func_n below
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ') C DOS64 only: fetch the fifth argument (cy) from the stack
- push %rbp
- mov cy, %rax
- neg %rax C set msb on carry
- xor R32(%rbp), R32(%rbp) C limb carry
- mov (vp), %r8 C cy has been copied to rax, so r8 is free to hold vp[0]
- shrd $RSH, %r8, %rbp C rbp was zero, so rbp = vp[0] << (64 - RSH)
- mov R32(n), R32(%r9)
- and $3, R32(%r9) C n mod 4 selects the entry point (rax is taken, so r9 is used here)
- je L(b00)
- cmp $2, R32(%r9)
- jc L(b01) C n mod 4 == 1
- je L(b10) C n mod 4 == 2
- jmp L(b11) C n mod 4 == 3
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(func_n) C in: rp, up, vp, n; result limb returned in rax
- FUNC_ENTRY(4)
- push %rbp
- xor R32(%rbp), R32(%rbp) C limb carry
- mov (vp), %r8
- shrd $RSH, %r8, %rbp C rbp was zero, so rbp = vp[0] << (64 - RSH)
- mov R32(n), R32(%rax)
- and $3, R32(%rax) C rax = n mod 4; being at most 3, doubling it below leaves the carry flag clear
- je L(b00)
- cmp $2, R32(%rax)
- jc L(b01) C n mod 4 == 1
- je L(b10) C n mod 4 == 2
-
-L(b11): mov 8(vp), %r9 C n mod 4 == 3: handle three limbs before the 4-way loop
- shrd $RSH, %r9, %r8
- mov 16(vp), %r10
- shrd $RSH, %r10, %r9
- add R32(%rax), R32(%rax) C init carry flag
- ADCSBB (up), %rbp
- ADCSBB 8(up), %r8
- ADCSBB 16(up), %r9
- mov %rbp, (rp)
- mov %r8, 8(rp)
- mov %r9, 16(rp)
- mov %r10, %rbp C keep the last vp limb read; its high bits feed the next limb
- lea 24(up), up
- lea 24(vp), vp
- lea 24(rp), rp
- sbb R32(%rax), R32(%rax) C save carry flag
- sub $3, n
- ja L(top)
- jmp L(end)
-
-L(b01): add R32(%rax), R32(%rax) C init carry flag
- ADCSBB (up), %rbp
- mov %rbp, (rp)
- mov %r8, %rbp C keep the last vp limb read; its high bits feed the next limb
- lea 8(up), up
- lea 8(vp), vp
- lea 8(rp), rp
- sbb R32(%rax), R32(%rax) C save carry flag
- sub $1, n
- ja L(top)
- jmp L(end)
-
-L(b10): mov 8(vp), %r9
- shrd $RSH, %r9, %r8
- add R32(%rax), R32(%rax) C init carry flag
- ADCSBB (up), %rbp
- ADCSBB 8(up), %r8
- mov %rbp, (rp)
- mov %r8, 8(rp)
- mov %r9, %rbp C keep the last vp limb read; its high bits feed the next limb
- lea 16(up), up
- lea 16(vp), vp
- lea 16(rp), rp
- sbb R32(%rax), R32(%rax) C save carry flag
- sub $2, n
- ja L(top)
- jmp L(end)
-
- ALIGN(16)
-L(top): mov (vp), %r8 C main loop, four limbs per iteration
- shrd $RSH, %r8, %rbp
-L(b00): mov 8(vp), %r9 C entry point when n mod 4 == 0 (first limb already prepared in rbp)
- shrd $RSH, %r9, %r8
- mov 16(vp), %r10
- shrd $RSH, %r10, %r9
- mov 24(vp), %r11
- shrd $RSH, %r11, %r10
- lea 32(vp), vp
- add R32(%rax), R32(%rax) C restore carry flag
- ADCSBB (up), %rbp
- ADCSBB 8(up), %r8
- ADCSBB 16(up), %r9
- ADCSBB 24(up), %r10
- lea 32(up), up
- mov %rbp, (rp)
- mov %r8, 8(rp)
- mov %r9, 16(rp)
- mov %r10, 24(rp)
- mov %r11, %rbp C keep the last vp limb read; its high bits feed the next limb
- lea 32(rp), rp
- sbb R32(%rax), R32(%rax) C save carry flag
- sub $4, n
- jnz L(top)
-
-L(end): shr $RSH, %rbp C bits of the last vp limb that were shifted out at the top
- add R32(%rax), R32(%rax) C restore carry flag
- ADCSBB $0, %rbp
- mov %rbp, %rax C return value
- pop %rbp
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/aorrlsh_n.asm b/gmp/mpn/x86_64/coreisbr/aorrlsh_n.asm
deleted file mode 100644
index db8ee68849..0000000000
--- a/gmp/mpn/x86_64/coreisbr/aorrlsh_n.asm
+++ /dev/null
@@ -1,215 +0,0 @@
-dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
-dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
-dnl Optimised for Sandy Bridge.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 5.25
-C Intel P4 ?
-C Intel core2 3.1
-C Intel NHM 3.95
-C Intel SBR 2.75
-C Intel atom ?
-C VIA nano ?
-
-C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way
-C unrolling). The rest of the code is quite crude, and could perhaps be made
-C both smaller and faster.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cnt', `%r8')
-define(`cy', `%r9') C for _nc variant
-
-ifdef(`OPERATION_addlsh_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(IFRSB, )
- define(func_n, mpn_addlsh_n)
- define(func_nc, mpn_addlsh_nc)')
-ifdef(`OPERATION_rsblsh_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(IFRSB, `$1')
- define(func_n, mpn_rsblsh_n)
- define(func_nc, mpn_rsblsh_nc)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
-C refmpn_rsblsh_nc
-MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(func_n) C in: rp, up, vp, n, cnt; result limb returned in rax
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ') C cnt
- push %rbx
- xor R32(%rbx), R32(%rbx) C clear CF save register
-L(ent): push %rbp C shared entry; func_nc jumps here with rbx already initialised
- mov R32(n), R32(%rbp)
- mov n, %rax C limb count moves to rax because rcx is about to hold the shift count
- mov R32(cnt), R32(%rcx)
- neg R32(%rcx) C cl = -cnt; 64-bit shifts use the count mod 64, so shifting right by cl shifts by 64-cnt
- and $3, R32(%rbp) C rbp = n mod 4 selects the entry point
- jz L(b0)
- lea -32(vp,%rbp,8), vp C bias the pointers so the leading 1-3 limbs end at offset 24
- lea -32(up,%rbp,8), up
- lea -32(rp,%rbp,8), rp
- cmp $2, R32(%rbp)
- jc L(b1) C n mod 4 == 1
- jz L(b2) C n mod 4 == 2
-
-L(b3): xor %r8, %r8 C n mod 4 == 3; zero supplies the bits below the first limb
- mov 8(vp), %r9
- mov 16(vp), %r10
- shrd R8(%rcx), %r9, %r8
- shrd R8(%rcx), %r10, %r9
- mov 24(vp), %r11
- shrd R8(%rcx), %r11, %r10
- sub $3, %rax
- jz L(3) C exactly three limbs: skip the loop
- add R32(%rbx), R32(%rbx) C restore CF from rbx
- lea 32(vp), vp
- ADCSBB 8(up), %r8
- ADCSBB 16(up), %r9
- ADCSBB 24(up), %r10
- lea 32(up), up
- jmp L(lo3)
-L(3): add R32(%rbx), R32(%rbx) C restore CF from rbx
- lea 32(vp), vp
- ADCSBB 8(up), %r8
- ADCSBB 16(up), %r9
- ADCSBB 24(up), %r10
- jmp L(wd3)
-
-L(b0): mov (vp), %r8 C n mod 4 == 0
- mov 8(vp), %r9
- xor R32(%rbp), R32(%rbp) C zero supplies the bits below the first limb
- jmp L(lo0)
-
-L(b1): xor %r10, %r10 C n mod 4 == 1; zero supplies the bits below the first limb
- mov 24(vp), %r11
- shrd R8(%rcx), %r11, %r10
- sub $1, %rax
- jz L(1) C exactly one limb: skip the loop
- add R32(%rbx), R32(%rbx) C restore CF from rbx
- lea 32(vp), vp
- ADCSBB 24(up), %r10
- lea 32(up), up
- mov (vp), %r8
- jmp L(lo1)
-L(1): add R32(%rbx), R32(%rbx) C restore CF from rbx
- ADCSBB 24(up), %r10
- jmp L(wd1)
-
-L(b2): xor %r9, %r9 C n mod 4 == 2; zero supplies the bits below the first limb
- mov 16(vp), %r10
- shrd R8(%rcx), %r10, %r9
- mov 24(vp), %r11
- shrd R8(%rcx), %r11, %r10
- sub $2, %rax
- jz L(2) C exactly two limbs: skip the loop
- add R32(%rbx), R32(%rbx) C restore CF from rbx
- lea 32(vp), vp
- ADCSBB 16(up), %r9
- ADCSBB 24(up), %r10
- lea 32(up), up
- jmp L(lo2)
-L(2): add R32(%rbx), R32(%rbx) C restore CF from rbx
- ADCSBB 16(up), %r9
- ADCSBB 24(up), %r10
- jmp L(wd2)
-
- ALIGN(32) C 16-byte alignment is not enough!
-L(top): shrd R8(%rcx), %r11, %r10 C main loop, four limbs per iteration
- add R32(%rbx), R32(%rbx) C restore CF from rbx
- lea 32(vp), vp
- ADCSBB (up), %rbp
- ADCSBB 8(up), %r8
- ADCSBB 16(up), %r9
- ADCSBB 24(up), %r10
- mov %rbp, (rp)
- lea 32(up), up
-L(lo3): mov %r8, 8(rp)
-L(lo2): mov %r9, 16(rp)
- mov (vp), %r8
-L(lo1): mov %r10, 24(rp)
- mov 8(vp), %r9
- mov %r11, %rbp C last vp limb read; its high bits feed the next limb
- lea 32(rp), rp
- sbb R32(%rbx), R32(%rbx) C save CF in rbx
-L(lo0): shrd R8(%rcx), %r8, %rbp
- mov 16(vp), %r10
- shrd R8(%rcx), %r9, %r8
- shrd R8(%rcx), %r10, %r9
- mov 24(vp), %r11
- sub $4, %rax
- jg L(top)
-
- shrd R8(%rcx), %r11, %r10 C wind-down: final group of four limbs
- add R32(%rbx), R32(%rbx) C restore CF from rbx
- ADCSBB (up), %rbp
- ADCSBB 8(up), %r8
- ADCSBB 16(up), %r9
- ADCSBB 24(up), %r10
- mov %rbp, (rp)
-L(wd3): mov %r8, 8(rp)
-L(wd2): mov %r9, 16(rp)
-L(wd1): mov %r10, 24(rp)
- adc R32(%rax), R32(%rax) C rax is zero after loop
- shr R8(%rcx), %r11 C bits of the last vp limb that were shifted out at the top
- ADDSUB %r11, %rax
-IFRSB( neg %rax)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
-PROLOGUE(func_nc) C carry-in entry: turns cy into the saved-CF value in rbx, then joins func_n
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ') C cnt
-IFDOS(` mov 64(%rsp), %r9 ') C cy
- push %rbx
- neg cy C sets CF exactly when cy is nonzero
- sbb R32(%rbx), R32(%rbx) C initialise CF save register
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/aors_n.asm b/gmp/mpn/x86_64/coreisbr/aors_n.asm
deleted file mode 100644
index 01abf78a0d..0000000000
--- a/gmp/mpn/x86_64/coreisbr/aors_n.asm
+++ /dev/null
@@ -1,198 +0,0 @@
-dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and
-dnl Haswell.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bull 1.82 average over 400-600
-C AMD pile 1.83 average over 400-600
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR 1.55 fluctuates
-C Intel IBR 1.55 fluctuates
-C Intel HWL 1.33 fluctuates
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The loop of this code was manually written. It runs close to optimally on
-C Intel SBR, IBR, and HWL far as we know, except for the fluctuation problems.
-C It also runs slightly faster on average on AMD bull and pile.
-C
-C No micro-optimisation has been done.
-C
-C N.B.! The loop alignment padding insns are executed. If editing the code,
-C make sure the padding does not become excessive. It is now a 4-byte nop.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`vp', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
-
-ifdef(`OPERATION_add_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)')
-ifdef(`OPERATION_sub_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(func)
- FUNC_ENTRY(4)
- xor %r8, %r8
-
-L(ent): mov R32(n), R32(%rax)
- shr $2, n
-
- test $1, R8(%rax)
- jnz L(bx1)
-
-L(bx0): test $2, R8(%rax)
- jnz L(b10)
-
-L(b00): neg %r8
- mov (up), %r8
- mov 8(up), %r9
- ADCSBB (vp), %r8
- ADCSBB 8(vp), %r9
- mov 16(up), %r10
- mov 24(up), %r11
- lea 32(up), up
- ADCSBB 16(vp), %r10
- ADCSBB 24(vp), %r11
- lea 32(vp), vp
- lea -16(rp), rp
- jmp L(lo0)
-
-L(b10): neg %r8
- mov (up), %r10
- mov 8(up), %r11
- ADCSBB 0(vp), %r10
- ADCSBB 8(vp), %r11
- jrcxz L(e2)
- mov 16(up), %r8
- mov 24(up), %r9
- lea 16(up), up
- ADCSBB 16(vp), %r8
- ADCSBB 24(vp), %r9
- lea 16(vp), vp
- lea (rp), rp
- jmp L(lo2)
-
-L(e2): mov %r10, (rp)
- mov %r11, 8(rp)
- setc R8(%rax)
- FUNC_EXIT()
- ret
-
-L(bx1): test $2, R8(%rax)
- jnz L(b11)
-
-L(b01): neg %r8
- mov (up), %r11
- ADCSBB (vp), %r11
- jrcxz L(e1)
- mov 8(up), %r8
- mov 16(up), %r9
- lea 8(up), up
- lea -8(rp), rp
- ADCSBB 8(vp), %r8
- ADCSBB 16(vp), %r9
- lea 8(vp), vp
- jmp L(lo1)
-
-L(e1): mov %r11, (rp)
- setc R8(%rax)
- FUNC_EXIT()
- ret
-
-L(b11): neg %r8
- mov (up), %r9
- ADCSBB (vp), %r9
- mov 8(up), %r10
- mov 16(up), %r11
- lea 24(up), up
- ADCSBB 8(vp), %r10
- ADCSBB 16(vp), %r11
- lea 24(vp), vp
- mov %r9, (rp)
- lea 8(rp), rp
- jrcxz L(end)
-
- ALIGN(32)
-L(top): mov (up), %r8
- mov 8(up), %r9
- ADCSBB (vp), %r8
- ADCSBB 8(vp), %r9
-L(lo2): mov %r10, (rp)
-L(lo1): mov %r11, 8(rp)
- mov 16(up), %r10
- mov 24(up), %r11
- lea 32(up), up
- ADCSBB 16(vp), %r10
- ADCSBB 24(vp), %r11
- lea 32(vp), vp
-L(lo0): mov %r8, 16(rp)
-L(lo3): mov %r9, 24(rp)
- lea 32(rp), rp
- dec n
- jnz L(top)
-
-L(end): mov R32(n), R32(%rax) C zero rax
- mov %r10, (rp)
- mov %r11, 8(rp)
- setc R8(%rax)
- FUNC_EXIT()
- ret
-EPILOGUE()
- ALIGN(16)
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- jmp L(ent)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/aorsmul_1.asm b/gmp/mpn/x86_64/coreisbr/aorsmul_1.asm
deleted file mode 100644
index 9f01d9c061..0000000000
--- a/gmp/mpn/x86_64/coreisbr/aorsmul_1.asm
+++ /dev/null
@@ -1,209 +0,0 @@
-dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR 3.24 (average, fluctuating in 3.20-3.57)
-C Intel IBR 3.04
-C Intel HWL
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimization tool suite written by David Harvey and Torbjörn Granlund.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0', `%rcx') C r9
-
-define(`n', `%rbx')
-
-define(`I',`$1')
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`func', `mpn_submul_1')
-')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-IFDOS(` define(`up', ``%rsi'')') dnl
-IFDOS(` define(`rp', ``%rcx'')') dnl
-IFDOS(` define(`v0', ``%r9'')') dnl
-IFDOS(` define(`r9', ``rdi'')') dnl
-IFDOS(` define(`n_param',``%r8'')') dnl
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(func)
-
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
- mov (up), %rax
- push %rbx
- lea (up,n_param,8), up
- lea (rp,n_param,8), rp
-
- test $1, R8(n_param)
- jnz L(b13)
-
-L(b02): xor R32(%r11), R32(%r11)
- test $2, R8(n_param)
- jnz L(b2)
-
-L(b0): mov $1, R32(n)
- sub n_param, n
- mul v0
- mov %rdx, %r9
- mov -8(rp,n,8), %r8
- jmp L(e0)
-
- ALIGN(16)
-L(b2): mov $-1, n
- sub n_param, n
- mul v0
- mov 8(rp,n,8), %r8
- mov %rdx, %r9
- jmp L(e2)
-
- ALIGN(16)
-L(b13): xor R32(%r9), R32(%r9)
- test $2, R8(n_param)
- jnz L(b3)
-
-L(b1): mov $2, R32(n)
- sub n_param, n
- jns L(1)
- mul v0
- mov -16(rp,n,8), %r10
- mov %rdx, %r11
- jmp L(e1)
-
- ALIGN(16)
-L(b3): xor R32(n), R32(n)
- sub n_param, n
- mul v0
- mov (rp,n,8), %r10
- jmp L(e3)
-
- ALIGN(32)
-L(top): mul v0
- mov -16(rp,n,8), %r10
- ADDSUB %r11, %r8
- mov %rdx, %r11
- adc $0, %r9
- mov %r8, -24(rp,n,8)
-L(e1): ADDSUB %rax, %r10
- mov -8(up,n,8), %rax
- adc $0, %r11
- mul v0
- ADDSUB %r9, %r10
- mov %rdx, %r9
- mov -8(rp,n,8), %r8
- adc $0, %r11
- mov %r10, -16(rp,n,8)
-L(e0): ADDSUB %rax, %r8
- adc $0, %r9
- mov (up,n,8), %rax
- mul v0
- mov (rp,n,8), %r10
- ADDSUB %r11, %r8
- mov %r8, -8(rp,n,8)
- adc $0, %r9
-L(e3): mov %rdx, %r11
- ADDSUB %rax, %r10
- mov 8(up,n,8), %rax
- adc $0, %r11
- mul v0
- mov 8(rp,n,8), %r8
- ADDSUB %r9, %r10
- mov %rdx, %r9
- mov %r10, (rp,n,8)
- adc $0, %r11
-L(e2): ADDSUB %rax, %r8
- adc $0, %r9
- mov 16(up,n,8), %rax
- add $4, n
- jnc L(top)
-
-L(end): mul v0
- mov I(-8(rp),-16(rp,n,8)), %r10
- ADDSUB %r11, %r8
- mov %rdx, %r11
- adc $0, %r9
- mov %r8, I(-16(rp),-24(rp,n,8))
- ADDSUB %rax, %r10
- adc $0, %r11
- ADDSUB %r9, %r10
- adc $0, %r11
- mov %r10, I(-8(rp),-16(rp,n,8))
- mov %r11, %rax
-
- pop %rbx
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
- ret
-
- ALIGN(16)
-L(1): mul v0
- ADDSUB %rax, -8(rp)
- mov %rdx, %rax
- adc $0, %rax
- pop %rbx
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/coreisbr/divrem_1.asm b/gmp/mpn/x86_64/coreisbr/divrem_1.asm
deleted file mode 100644
index d9f371f785..0000000000
--- a/gmp/mpn/x86_64/coreisbr/divrem_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_divrem_1
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_divrem_1 mpn_preinv_divrem_1)
-include_mpn(`x86_64/divrem_1.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/gmp-mparam.h b/gmp/mpn/x86_64/coreisbr/gmp-mparam.h
deleted file mode 100644
index 3a91b4c30e..0000000000
--- a/gmp/mpn/x86_64/coreisbr/gmp-mparam.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Sandy Bridge gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-/* 3300 MHz Core i5 Sandy Bridge */
-/* FFT tuning limit = 100000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 30
-
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 166
-#define MUL_TOOM6H_THRESHOLD 254
-#define MUL_TOOM8H_THRESHOLD 333
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 148
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 93
-#define SQR_TOOM4_THRESHOLD 250
-#define SQR_TOOM6_THRESHOLD 348
-#define SQR_TOOM8_THRESHOLD 454
-
-#define MULMID_TOOM42_THRESHOLD 36
-
-#define MULMOD_BNM1_THRESHOLD 12
-#define SQRMOD_BNM1_THRESHOLD 15
-
-#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
- { 159, 9}, { 319,10}, { 167,11}, { 95,10}, \
- { 191, 9}, { 383, 8}, { 767, 7}, { 1599, 8}, \
- { 831, 9}, { 447,10}, { 239,12}, { 63,11}, \
- { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \
- { 575,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,10}, { 447,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 271,10}, { 543, 8}, \
- { 2175,11}, { 303,12}, { 159,11}, { 319,10}, \
- { 671,11}, { 367,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
- { 447,10}, { 895,11}, { 479,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 671,12}, { 351,11}, { 703,10}, \
- { 1407,13}, { 191,12}, { 383,11}, { 767,12}, \
- { 415,11}, { 831,12}, { 479,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 575,11}, \
- { 1151,12}, { 607,13}, { 319,12}, { 671,11}, \
- { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \
- { 1535,12}, { 831,13}, { 447,12}, { 959,11}, \
- { 1919,14}, { 255,13}, { 511,12}, { 1087,13}, \
- { 575,12}, { 1215,13}, { 639,12}, { 1279,13}, \
- { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
- { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \
- { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \
- { 2431,14}, { 639,13}, { 1343,12}, { 2687,13}, \
- { 1407,12}, { 2815,13}, { 1471,12}, { 2943,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
- { 1407,13}, { 2943,15}, { 767,14}, { 1535,13}, \
- { 3071,14}, { 1663,13}, { 3455,14}, { 1919,16}, \
- { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
- { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
- { 3455,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
- { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
- { 4863,15}, { 2815,14}, { 5887,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 203
-#define MUL_FFT_THRESHOLD 4736
-
-#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 336, 5}, { 11, 4}, { 23, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \
- { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 63,10}, { 39, 9}, { 79,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 135,11}, { 79, 9}, { 319, 8}, \
- { 639,11}, { 95,10}, { 191, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,11}, { 143,10}, { 287, 8}, \
- { 1151,10}, { 303, 6}, { 4863, 8}, { 1279, 9}, \
- { 671,11}, { 175,10}, { 367,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207, 9}, { 831,10}, \
- { 447,13}, { 63,12}, { 127,11}, { 255,10}, \
- { 511,11}, { 271, 9}, { 1087,10}, { 575,11}, \
- { 303,10}, { 607,11}, { 319,10}, { 671,11}, \
- { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
- { 415,12}, { 223,11}, { 447,10}, { 959,12}, \
- { 255,11}, { 511,10}, { 1023,11}, { 575,10}, \
- { 1151,11}, { 607,10}, { 1215,12}, { 319,11}, \
- { 671, 9}, { 2687,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 415,11}, { 831,12}, { 479,14}, \
- { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
- { 607,13}, { 319,12}, { 671,11}, { 1343,12}, \
- { 703,13}, { 383,12}, { 831,13}, { 447,12}, \
- { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \
- { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
- { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
- { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \
- { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
- { 1343,12}, { 2687,13}, { 1407,12}, { 2815,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
- { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \
- { 3071,14}, { 1663,13}, { 3455,14}, { 1791,16}, \
- { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
- { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
- { 3455,15}, { 1791,14}, { 3839,16}, { 1023,15}, \
- { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \
- { 2815,14}, { 5887,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 190
-#define SQR_FFT_THRESHOLD 3264
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 62
-#define MULLO_MUL_N_THRESHOLD 8907
-
-#define DC_DIV_QR_THRESHOLD 52
-#define DC_DIVAPPR_Q_THRESHOLD 166
-#define DC_BDIV_QR_THRESHOLD 46
-#define DC_BDIV_Q_THRESHOLD 104
-
-#define INV_MULMOD_BNM1_THRESHOLD 42
-#define INV_NEWTON_THRESHOLD 166
-#define INV_APPR_THRESHOLD 165
-
-#define BINV_NEWTON_THRESHOLD 228
-#define REDC_1_TO_REDC_2_THRESHOLD 32
-#define REDC_2_TO_REDC_N_THRESHOLD 52
-
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1387
-#define MUPI_DIV_QR_THRESHOLD 69
-#define MU_BDIV_QR_THRESHOLD 1187
-#define MU_BDIV_Q_THRESHOLD 1334
-
-#define POWM_SEC_TABLE 3,22,194,452,1167
-
-#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 119
-#define HGCD_APPR_THRESHOLD 51
-#define HGCD_REDUCE_THRESHOLD 2479
-#define GCD_DC_THRESHOLD 478
-#define GCDEXT_DC_THRESHOLD 368
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 22
-#define SET_STR_DC_THRESHOLD 802
-#define SET_STR_PRECOMPUTE_THRESHOLD 2042
-
-#define FAC_DSC_THRESHOLD 644
-#define FAC_ODD_THRESHOLD 24
diff --git a/gmp/mpn/x86_64/coreisbr/lshift.asm b/gmp/mpn/x86_64/coreisbr/lshift.asm
deleted file mode 100644
index a1cbc31f61..0000000000
--- a/gmp/mpn/x86_64/coreisbr/lshift.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_lshift)
-include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/lshiftc.asm b/gmp/mpn/x86_64/coreisbr/lshiftc.asm
deleted file mode 100644
index ac90edb76b..0000000000
--- a/gmp/mpn/x86_64/coreisbr/lshiftc.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_lshiftc)
-include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/mul_1.asm b/gmp/mpn/x86_64/coreisbr/mul_1.asm
deleted file mode 100644
index ded7d899c2..0000000000
--- a/gmp/mpn/x86_64/coreisbr/mul_1.asm
+++ /dev/null
@@ -1,161 +0,0 @@
-dnl X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9
-C AMD K10
-C AMD bd1
-C AMD bobcat
-C Intel P4
-C Intel core2
-C Intel NHM
-C Intel SBR 2.5
-C Intel IBR 2.4
-C Intel atom
-C VIA nano
-
-C The loop of this code is the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-C TODO
-C * The loop is great, but the prologue code was quickly written. Tune it!
-C * Add mul_1c entry point.
-C * We could preserve one less register under DOS64 calling conventions, using
-C r10 instead of rsi.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0', `%rcx') C r9
-
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(` define(`up', ``%rsi'')') dnl
-IFDOS(` define(`rp', ``%rcx'')') dnl
-IFDOS(` define(`v0', ``%r9'')') dnl
-IFDOS(` define(`r9', ``rdi'')') dnl
-IFDOS(` define(`n_param',``%r8'')') dnl
-IFDOS(` define(`n', ``%r8'')') dnl
-IFDOS(` define(`r8', ``r11'')') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_1)
-
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
- mov (up), %rax
- mov R32(`n_param'), R32(%r10)
-IFSTD(` mov n_param, n ')
-
- lea (up,n_param,8), up
- lea -8(rp,n_param,8), rp
- neg n
- mul v0
- and $3, R32(%r10)
- jz L(b0)
- cmp $2, R32(%r10)
- jb L(b1)
- jz L(b2)
-
-L(b3): add $-1, n
- mov %rax, %r9
- mov %rdx, %r8
- mov 16(up,n,8), %rax
- jmp L(L3)
-
-L(b1): mov %rax, %r9
- mov %rdx, %r8
- add $1, n
- jnc L(L1)
- mov %rax, (rp)
- mov %rdx, %rax
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
- ret
-
-L(b2): add $-2, n
- mov %rax, %r8
- mov %rdx, %r9
- mov 24(up,n,8), %rax
- jmp L(L2)
-
-L(b0): mov %rax, %r8
- mov %rdx, %r9
- mov 8(up,n,8), %rax
- jmp L(L0)
-
- ALIGN(8)
-L(top): mov %rdx, %r8
- add %rax, %r9
-L(L1): mov 0(up,n,8), %rax
- adc $0, %r8
- mul v0
- add %rax, %r8
- mov %r9, 0(rp,n,8)
- mov 8(up,n,8), %rax
- mov %rdx, %r9
- adc $0, %r9
-L(L0): mul v0
- mov %r8, 8(rp,n,8)
- add %rax, %r9
- mov %rdx, %r8
- mov 16(up,n,8), %rax
- adc $0, %r8
-L(L3): mul v0
- mov %r9, 16(rp,n,8)
- mov %rdx, %r9
- add %rax, %r8
- mov 24(up,n,8), %rax
- adc $0, %r9
-L(L2): mul v0
- mov %r8, 24(rp,n,8)
- add $4, n
- jnc L(top)
-
-L(end): add %rax, %r9
- mov %rdx, %rax
- adc $0, %rax
- mov %r9, (rp)
-
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/mul_2.asm b/gmp/mpn/x86_64/coreisbr/mul_2.asm
deleted file mode 100644
index ffee78a385..0000000000
--- a/gmp/mpn/x86_64/coreisbr/mul_2.asm
+++ /dev/null
@@ -1,163 +0,0 @@
-dnl AMD64 mpn_mul_2 optimised for Intel Sandy Bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb best
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR 2.57 2.52 using 4-way code
-C Intel IBR 2.35 2.32 using 4-way code
-C Intel HWL 2.02 1.86
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C This code is the result of running a code generation and optimisation tool
-C suite written by David Harvey and Torbjorn Granlund.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`vp', `%rcx') C r9
-
-define(`n', `%rcx')
-define(`v0', `%rbx')
-define(`v1', `%rbp')
-
-define(`w0', `%r8')
-define(`w1', `%r9')
-define(`w2', `%r10')
-define(`w3', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mul_2)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (vp), v0
- mov 8(vp), v1
-
- mov (up), %rax
- lea (up,n_param,8), up
- lea (rp,n_param,8), rp
-
- test $1, R8(n_param)
- jnz L(b1)
-
-L(b0): mov $0, R32(n)
- sub n_param, n
- xor w0, w0
- mul v0
- mov %rax, w2
- mov %rdx, w1
- mov (up,n,8), %rax
- jmp L(lo0)
-
-L(b1): mov $1, R32(n)
- sub n_param, n
- xor w2, w2
- mul v0
- mov %rax, w0
- mov %rdx, w3
- mov -8(up,n,8), %rax
- mul v1
- jmp L(lo1)
-
- ALIGN(32)
-L(top): mul v0
- add %rax, w0 C 1
- mov %rdx, w3 C 2
- adc $0, w3 C 2
- mov -8(up,n,8), %rax
- mul v1
- add w1, w0 C 1
- adc $0, w3 C 2
-L(lo1): add %rax, w2 C 2
- mov w0, -8(rp,n,8) C 1
- mov %rdx, w0 C 3
- adc $0, w0 C 3
- mov (up,n,8), %rax
- mul v0
- add %rax, w2 C 2
- mov %rdx, w1 C 3
- adc $0, w1 C 3
- add w3, w2 C 2
- mov (up,n,8), %rax
- adc $0, w1 C 1
-L(lo0): mul v1
- mov w2, (rp,n,8) C 2
- add %rax, w0 C 3
- mov %rdx, w2 C 4
- mov 8(up,n,8), %rax
- adc $0, w2 C 4
- add $2, n
- jnc L(top)
-
-L(end): mul v0
- add %rax, w0
- mov %rdx, w3
- adc $0, w3
- mov I(-8(up),-8(up,n,8)), %rax
- mul v1
- add w1, w0
- adc $0, w3
- add %rax, w2
- mov w0, I(-8(rp),-8(rp,n,8))
- adc $0, %rdx
- add w3, w2
- mov w2, I((rp),(rp,n,8))
- adc $0, %rdx
- mov %rdx, %rax
-
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/mul_basecase.asm b/gmp/mpn/x86_64/coreisbr/mul_basecase.asm
deleted file mode 100644
index f026136ea0..0000000000
--- a/gmp/mpn/x86_64/coreisbr/mul_basecase.asm
+++ /dev/null
@@ -1,407 +0,0 @@
-dnl AMD64 mpn_mul_basecase optimised for Intel Sandy bridge and Ivy bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_1 mul_2 mul_3 addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR 2.5 2.5 - 2.95
-C Intel IBR 2.4 2.3 - 2.68
-C Intel HWL 2.35 2.0 - 2.5
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-C TODO
-C * Fix the addmul_2 fluctuation affecting SBR.
-C * Improve feed-in code, avoiding zeroing of many registers and dummy adds in
-C the loops at the expense of code size.
-C * Adjoin a mul_3, avoiding slow mul_1 for odd vn.
-C * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight
-C speedup.
-C * Further micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-define(`vp', `%rcx')
-define(`vn', `%r8')
-
-define(`un', `%rbx')
-
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`n', `%rbp')
-define(`v0', `%r9')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
- push %rbx
- push %rbp
- mov un_param, un C free up rdx
- neg un
-
- mov (up), %rax C shared for mul_1 and mul_2
- lea (up,un_param,8), up C point at operand end
- lea (rp,un_param,8), rp C point at rp[un-1]
-
- mov (vp), v0 C shared for mul_1 and mul_2
- mul v0 C shared for mul_1 and mul_2
-
- test $1, R8(vn)
- jz L(do_mul_2)
-
-L(do_mul_1):
- test $1, R8(un)
- jnz L(m1x1)
-
-L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
- mov %rdx, w1
- mov 8(up,un,8), %rax
- test $2, R8(un)
- jnz L(m110)
-
-L(m100):lea 2(un), n C un = 4, 8, 12, ...
- jmp L(m1l0)
-
-L(m110):lea (un), n C un = 2, 6, 10, ...
- jmp L(m1l2)
-
-L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
- mov %rdx, w0
- test $2, R8(un)
- jz L(m111)
-
-L(m101):lea 3(un), n C un = 1, 5, 9, ...
- test n, n
- js L(m1l1)
- mov %rax, -8(rp)
- mov %rdx, (rp)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(m111):lea 1(un), n C un = 3, 7, 11, ...
- mov 8(up,un,8), %rax
- jmp L(m1l3)
-
- ALIGN(16) C FIXME
-L(m1tp):mov %rdx, w0
- add %rax, w1
-L(m1l1):mov -16(up,n,8), %rax
- adc $0, w0
- mul v0
- add %rax, w0
- mov w1, -24(rp,n,8)
- mov -8(up,n,8), %rax
- mov %rdx, w1
- adc $0, w1
-L(m1l0):mul v0
- mov w0, -16(rp,n,8)
- add %rax, w1
- mov %rdx, w0
- mov (up,n,8), %rax
- adc $0, w0
-L(m1l3):mul v0
- mov w1, -8(rp,n,8)
- mov %rdx, w1
- add %rax, w0
- mov 8(up,n,8), %rax
- adc $0, w1
-L(m1l2):mul v0
- mov w0, (rp,n,8)
- add $4, n
- jnc L(m1tp)
-
-L(m1ed):add %rax, w1
- adc $0, %rdx
- mov w1, I(-8(rp),-24(rp,n,8))
- mov %rdx, I((rp),-16(rp,n,8))
-
- dec R32(vn)
- jz L(ret2)
-
- lea 8(vp), vp
- lea 8(rp), rp
- push %r12
- push %r13
- push %r14
- jmp L(do_addmul)
-
-L(do_mul_2):
-define(`v1', `%r14')
- push %r12
- push %r13
- push %r14
-
- mov 8(vp), v1
-
- test $1, R8(un)
- jnz L(m2b1)
-
-L(m2b0):lea (un), n
- xor w0, w0
- mov %rax, w2
- mov %rdx, w1
- jmp L(m2l0)
-
-L(m2b1):lea 1(un), n
- xor w1, w1
- xor w2, w2
- mov %rax, w0
- mov %rdx, w3
- jmp L(m2l1)
-
- ALIGN(32)
-L(m2tp):mul v0
- add %rax, w0
- mov %rdx, w3
- adc $0, w3
-L(m2l1):mov -8(up,n,8), %rax
- mul v1
- add w1, w0
- adc $0, w3
- add %rax, w2
- mov w0, -8(rp,n,8)
- mov %rdx, w0
- adc $0, w0
- mov (up,n,8), %rax
- mul v0
- add %rax, w2
- mov %rdx, w1
- adc $0, w1
- add w3, w2
-L(m2l0):mov (up,n,8), %rax
- adc $0, w1
- mul v1
- mov w2, (rp,n,8)
- add %rax, w0
- mov %rdx, w2
- mov 8(up,n,8), %rax
- adc $0, w2
- add $2, n
- jnc L(m2tp)
-
-L(m2ed):mul v0
- add %rax, w0
- mov %rdx, w3
- adc $0, w3
- mov I(-8(up),-8(up,n,8)), %rax
- mul v1
- add w1, w0
- adc $0, w3
- add %rax, w2
- mov w0, I(-8(rp),-8(rp,n,8))
- adc $0, %rdx
- add w3, w2
- mov w2, I((rp),(rp,n,8))
- adc $0, %rdx
- mov %rdx, I(8(rp),8(rp,n,8))
-
- add $-2, R32(vn)
- jz L(ret5)
- lea 16(vp), vp
- lea 16(rp), rp
-
-
-L(do_addmul):
- push %r15
- push vn C save vn in new stack slot
-define(`vn', `(%rsp)')
-define(`X0', `%r14')
-define(`X1', `%r15')
-define(`v1', `%r8')
-
-L(outer):
- mov (vp), v0
- mov 8(vp), v1
- mov (up,un,8), %rax
- mul v0
- test $1, R8(un)
- jnz L(a1x1)
-
-L(a1x0):mov (rp,un,8), X0
- xor w0, w0
- mov %rdx, w1
- test $2, R8(un)
- jnz L(a110)
-
-L(a100):lea 2(un), n C un = 4, 8, 12, ...
- add %rax, X0
- adc $0, w1
- mov (up,un,8), %rax
- mul v1
- mov 8(rp,un,8), X1 C FIXME: Use un
- jmp L(lo0)
-
-L(a110):lea (un), n C un = 2, 6, 10, ...
- xor w3, w3
- jmp L(lo2)
-
-L(a1x1):mov (rp,un,8), X1
- xor w2, w2
- xor w1, w1
- test $2, R8(un)
- jz L(a111)
-
-L(a101):lea 3(un), n C un = 1, 5, 9, ...
- mov %rdx, w3
- add %rax, X1
- mov (up,un,8), %rax
- mov 8(rp,un,8), X0
- adc $0, w3
- jmp L(top)
-
-L(a111):lea 1(un), n C un = 3, 7, 11, ...
- jmp L(lo3)
-
- ALIGN(32)
-L(top): mul v1
- mov %rdx, w0
- add %rax, X0
- adc $0, w0
- add w1, X1
- adc $0, w3
- add w2, X0
- adc $0, w0
- mov -16(up,n,8), %rax
- mul v0
- add %rax, X0
- mov %rdx, w1
- adc $0, w1
- mov -16(up,n,8), %rax
- mul v1
- mov X1, -24(rp,n,8)
- mov -8(rp,n,8), X1
- add w3, X0
- adc $0, w1
-L(lo0): mov %rdx, w2
- mov X0, -16(rp,n,8)
- add %rax, X1
- adc $0, w2
- mov -8(up,n,8), %rax
- add w0, X1
- adc $0, w2
- mul v0
-L(lo3): add %rax, X1
- mov %rdx, w3
- adc $0, w3
- mov -8(up,n,8), %rax
- mul v1
- add w1, X1
- mov (rp,n,8), X0
- adc $0, w3
- mov %rdx, w0
- add %rax, X0
- adc $0, w0
- mov (up,n,8), %rax
- mul v0
- add w2, X0
- mov X1, -8(rp,n,8)
- mov %rdx, w1
- adc $0, w0
-L(lo2): add %rax, X0
- adc $0, w1
- mov (up,n,8), %rax
- add w3, X0
- adc $0, w1
- mul v1
- mov 8(rp,n,8), X1
- add %rax, X1
- mov %rdx, w2
- adc $0, w2
- mov 8(up,n,8), %rax
- mov X0, (rp,n,8)
- mul v0
- add w0, X1
- mov %rdx, w3
- adc $0, w2
- add %rax, X1
- mov 8(up,n,8), %rax
- mov 16(rp,n,8), X0 C useless but harmless in final iter
- adc $0, w3
- add $4, n
- jnc L(top)
-
-L(end): mul v1
- add w1, X1
- adc $0, w3
- add w2, %rax
- adc $0, %rdx
- mov X1, I(-8(rp),-24(rp,n,8))
- add w3, %rax
- adc $0, %rdx
- mov %rax, I((rp),-16(rp,n,8))
- mov %rdx, I(8(rp),-8(rp,n,8))
-
- addl $-2, vn
- lea 16(vp), vp
- lea 16(rp), rp
- jnz L(outer)
-
- pop %rax C deallocate vn slot
- pop %r15
-L(ret5):pop %r14
- pop %r13
- pop %r12
-L(ret2):pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/mullo_basecase.asm b/gmp/mpn/x86_64/coreisbr/mullo_basecase.asm
deleted file mode 100644
index a41a8acee4..0000000000
--- a/gmp/mpn/x86_64/coreisbr/mullo_basecase.asm
+++ /dev/null
@@ -1,384 +0,0 @@
-dnl AMD64 mpn_mullo_basecase optimised for Intel Sandy bridge and Ivy bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2
-C AMD K8,K9
-C AMD K10
-C AMD bull
-C AMD pile
-C AMD steam
-C AMD bobcat
-C AMD jaguar
-C Intel P4
-C Intel core
-C Intel NHM
-C Intel SBR 2.5 2.95
-C Intel IBR 2.3 2.68
-C Intel HWL 2.0 2.5
-C Intel BWL
-C Intel atom
-C VIA nano
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Implement proper cor2, replacing current cor0.
-C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?)
-C * Micro-optimise.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n', `%rcx')
-
-define(`vp', `%r8')
-define(`X0', `%r14')
-define(`X1', `%r15')
-
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%r12')
-define(`w3', `%r13')
-define(`i', `%rbp')
-define(`v0', `%r9')
-define(`v1', `%rbx')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mullo_basecase)
- FUNC_ENTRY(4)
-
- mov (up), %rax
- mov vp_param, vp
-
- cmp $4, n
- jb L(small)
-
- mov (vp_param), v0
- push %rbx
- lea (rp,n,8), rp C point rp at R[un]
- push %rbp
- lea (up,n,8), up C point up right after U's end
- push %r12
- neg n
- push %r13
- mul v0
- mov 8(vp), v1
-
- test $1, R8(n)
- jnz L(m2b1)
-
-L(m2b0):lea (n), i
- xor w0, w0
- mov %rax, w2
- mov %rdx, w1
- jmp L(m2l0)
-
-L(m2b1):lea 1(n), i
- xor w1, w1
- xor w2, w2
- mov %rax, w0
- mov %rdx, w3
- jmp L(m2l1)
-
- ALIGN(32)
-L(m2tp):mul v0
- add %rax, w0
- mov %rdx, w3
- adc $0, w3
-L(m2l1):mov -8(up,i,8), %rax
- mul v1
- add w1, w0
- adc $0, w3
- add %rax, w2
- mov w0, -8(rp,i,8)
- mov %rdx, w0
- adc $0, w0
- mov (up,i,8), %rax
- mul v0
- add %rax, w2
- mov %rdx, w1
- adc $0, w1
- add w3, w2
-L(m2l0):mov (up,i,8), %rax
- adc $0, w1
- mul v1
- mov w2, (rp,i,8)
- add %rax, w0
- mov %rdx, w2 C FIXME: dead in last iteration
- mov 8(up,i,8), %rax
- adc $0, w2 C FIXME: dead in last iteration
- add $2, i
- jnc L(m2tp)
-
-L(m2ed):imul v0, %rax
- add w0, %rax
- add w1, %rax
- mov %rax, I(-8(rp),-8(rp,i,8))
-
- add $2, n
- lea 16(vp), vp
- lea -16(up), up
- cmp $-2, n
- jge L(cor1)
-
- push %r14
- push %r15
-
-L(outer):
- mov (vp), v0
- mov 8(vp), v1
- mov (up,n,8), %rax
- mul v0
- test $1, R8(n)
- jnz L(a1x1)
-
-L(a1x0):mov (rp,n,8), X1
- xor w2, w2
- xor w1, w1
- test $2, R8(n)
- jnz L(a110)
-
-L(a100):lea 1(n), i
- jmp L(lo0)
-
-L(a110):lea 3(n), i
- mov %rdx, w3
- add %rax, X1
- mov (up,n,8), %rax
- mov 8(rp,n,8), X0
- adc $0, w3
- jmp L(lo2)
-
-L(a1x1):mov (rp,n,8), X0
- xor w0, w0
- mov %rdx, w1
- test $2, R8(n)
- jz L(a111)
-
-L(a101):lea 2(n), i
- add %rax, X0
- adc $0, w1
- mov (up,n,8), %rax
- mul v1
- mov 8(rp,n,8), X1
- jmp L(lo1)
-
-L(a111):lea (n), i
- xor w3, w3
- jmp L(lo3)
-
- ALIGN(32)
-L(top):
-L(lo2): mul v1
- mov %rdx, w0
- add %rax, X0
- adc $0, w0
- add w1, X1
- adc $0, w3
- add w2, X0
- adc $0, w0
- mov -16(up,i,8), %rax
- mul v0
- add %rax, X0
- mov %rdx, w1
- adc $0, w1
- mov -16(up,i,8), %rax
- mul v1
- mov X1, -24(rp,i,8)
- mov -8(rp,i,8), X1
- add w3, X0
- adc $0, w1
-L(lo1): mov %rdx, w2
- mov X0, -16(rp,i,8)
- add %rax, X1
- adc $0, w2
- mov -8(up,i,8), %rax
- add w0, X1
- adc $0, w2
- mul v0
-L(lo0): add %rax, X1
- mov %rdx, w3
- adc $0, w3
- mov -8(up,i,8), %rax
- mul v1
- add w1, X1
- mov (rp,i,8), X0
- adc $0, w3
- mov %rdx, w0
- add %rax, X0
- adc $0, w0
- mov (up,i,8), %rax
- mul v0
- add w2, X0
- mov X1, -8(rp,i,8)
- mov %rdx, w1
- adc $0, w0
-L(lo3): add %rax, X0
- adc $0, w1
- mov (up,i,8), %rax
- add w3, X0
- adc $0, w1
- mul v1
- mov 8(rp,i,8), X1
- add %rax, X1
- mov %rdx, w2
- adc $0, w2
- mov 8(up,i,8), %rax
- mov X0, (rp,i,8)
- mul v0
- add w0, X1
- mov %rdx, w3
- adc $0, w2
- add %rax, X1
- mov 8(up,i,8), %rax
- mov 16(rp,i,8), X0
- adc $0, w3
- add $4, i
- jnc L(top)
-
-L(end): imul v1, %rax
- add %rax, X0
- add w1, X1
- adc $0, w3
- add w2, X0
- mov I(-8(up),-16(up,i,8)), %rax
- imul v0, %rax
- add X0, %rax
- mov X1, I(-16(rp),-24(rp,i,8))
- add w3, %rax
- mov %rax, I(-8(rp),-16(rp,i,8))
-
- add $2, n
- lea 16(vp), vp
- lea -16(up), up
- cmp $-2, n
- jl L(outer)
-
- pop %r15
- pop %r14
-
- jnz L(cor0)
-
-L(cor1):mov (vp), v0
- mov 8(vp), v1
- mov -16(up), %rax
- mul v0 C u0 x v2
- add -16(rp), %rax C FIXME: rp[0] still available in reg?
- adc -8(rp), %rdx C FIXME: rp[1] still available in reg?
- mov -8(up), %r10
- imul v0, %r10
- mov -16(up), %r11
- imul v1, %r11
- mov %rax, -16(rp)
- add %r10, %r11
- add %rdx, %r11
- mov %r11, -8(rp)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(cor0):mov (vp), %r11
- imul -8(up), %r11
- add %rax, %r11
- mov %r11, -8(rp)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(small):
- cmp $2, n
- jae L(gt1)
-L(n1): imul (vp_param), %rax
- mov %rax, (rp)
- FUNC_EXIT()
- ret
-L(gt1): ja L(gt2)
-L(n2): mov (vp_param), %r9
- mul %r9
- mov %rax, (rp)
- mov 8(up), %rax
- imul %r9, %rax
- add %rax, %rdx
- mov 8(vp), %r9
- mov (up), %rcx
- imul %r9, %rcx
- add %rcx, %rdx
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-L(gt2):
-L(n3): mov (vp_param), %r9
- mul %r9 C u0 x v0
- mov %rax, (rp)
- mov %rdx, %r10
- mov 8(up), %rax
- mul %r9 C u1 x v0
- imul 16(up), %r9 C u2 x v0
- add %rax, %r10
- adc %rdx, %r9
- mov 8(vp), %r11
- mov (up), %rax
- mul %r11 C u0 x v1
- add %rax, %r10
- adc %rdx, %r9
- imul 8(up), %r11 C u1 x v1
- add %r11, %r9
- mov %r10, 8(rp)
- mov 16(vp), %r10
- mov (up), %rax
- imul %rax, %r10 C u0 x v2
- add %r10, %r9
- mov %r9, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/popcount.asm b/gmp/mpn/x86_64/coreisbr/popcount.asm
deleted file mode 100644
index a5be33e6a7..0000000000
--- a/gmp/mpn/x86_64/coreisbr/popcount.asm
+++ /dev/null
@@ -1,118 +0,0 @@
-dnl AMD64 mpn_popcount -- population count.
-
-dnl Copyright 2008, 2010-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 1.5 slower than 8-way non-pipelined code
-C AMD bd1 4.2
-C AMD bobcat 6.28 slower than 8-way non-pipelined code
-C Intel P4 n/a
-C Intel core2 n/a
-C Intel NHM 1.32
-C Intel SBR 1.05 fluctuating
-C Intel IBR 1.05 fluctuating
-C Intel HSW 1
-C Intel atom n/a
-C VIA nano n/a
-
-define(`up', `%rdi')
-define(`n_param', `%rsi')
-
-define(`n', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_popcount)
- FUNC_ENTRY(2)
-
- lea (up,n_param,8), up
- xor R32(%rax), R32(%rax)
-
- test $1, R8(n_param)
- jnz L(bx1)
-
-L(bx0): test $2, R8(n_param)
- jnz L(b10)
-
-L(b00): mov $0, R32(n)
- sub n_param, n
- .byte 0xf3,0x4c,0x0f,0xb8,0x04,0xcf C popcnt (up,n,8), %r8
- .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xcf,0x08 C popcnt 8(up,n,8), %r9
- jmp L(lo0)
-
-L(b10): mov $2, R32(n)
- sub n_param, n
- .byte 0xf3,0x4c,0x0f,0xb8,0x54,0xcf,0xf0 C popcnt -16(up,n,8), %r10
- .byte 0xf3,0x4c,0x0f,0xb8,0x5c,0xcf,0xf8 C popcnt -8(up,n,8), %r11
- test n, n
- jz L(cj2)
- jmp L(lo2)
-
-L(bx1): test $2, R8(n_param)
- jnz L(b11)
-
-L(b01): mov $1, R32(n)
- sub n_param, n
- .byte 0xf3,0x4c,0x0f,0xb8,0x5c,0xcf,0xf8 C popcnt -8(up,n,8), %r11
- test n, n
- jz L(cj1)
- .byte 0xf3,0x4c,0x0f,0xb8,0x04,0xcf C popcnt 0(up,n,8), %r8
- jmp L(lo1)
-
-L(b11): mov $-1, n
- sub n_param, n
- .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xcf,0x08 C popcnt 8(up,n,8), %r9
- .byte 0xf3,0x4c,0x0f,0xb8,0x54,0xcf,0x10 C popcnt 16(up,n,8), %r10
- jmp L(lo3)
-
- ALIGN(32)
-L(top): add %r9, %rax
-L(lo2): .byte 0xf3,0x4c,0x0f,0xb8,0x04,0xcf C popcnt 0(up,n,8), %r8
- add %r10, %rax
-L(lo1): .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xcf,0x08 C popcnt 8(up,n,8), %r9
- add %r11, %rax
-L(lo0): .byte 0xf3,0x4c,0x0f,0xb8,0x54,0xcf,0x10 C popcnt 16(up,n,8), %r10
- add %r8, %rax
-L(lo3): .byte 0xf3,0x4c,0x0f,0xb8,0x5c,0xcf,0x18 C popcnt 24(up,n,8), %r11
- add $4, n
- js L(top)
-
-L(end): add %r9, %rax
-L(cj2): add %r10, %rax
-L(cj1): add %r11, %rax
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/redc_1.asm b/gmp/mpn/x86_64/coreisbr/redc_1.asm
deleted file mode 100644
index 8a5170e3fd..0000000000
--- a/gmp/mpn/x86_64/coreisbr/redc_1.asm
+++ /dev/null
@@ -1,541 +0,0 @@
-dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bull ?
-C AMD pile ?
-C AMD steam ?
-C AMD bobcat ?
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core ?
-C Intel NHM ?
-C Intel SBR 3.24
-C Intel IBR 3.04
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom ?
-C VIA nano ?
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Micro-optimise, none performed thus far.
-C * Consider inlining mpn_add_n.
-C * Single basecases out before the pushes.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv', `%r8') C stack
-
-define(`i', `%r14')
-define(`j', `%r15')
-define(`mp', `%r12')
-define(`q0', `%r13')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-define(`ALIGNx', `ALIGN(16)')
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_redc_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (up), q0
- mov n, j C outer loop induction var
- lea 8(mp_param,n,8), mp
- lea 8(up,n,8), up
- neg n
- imul u0inv, q0 C first iteration q0
-
- test $1, R8(n)
- jz L(bx0)
-
-L(bx1): test $2, R8(n)
- jz L(b3)
-
-L(b1): cmp $-1, R32(n)
- jz L(n1)
-
-L(otp1):lea 1(n), i
- mov -8(mp,n,8), %rax
- mul q0
- mov -8(up,n,8), %r10
- mov %rdx, %r11
- add %rax, %r10
- mov (mp,n,8), %rax
- adc $0, %r11
- mul q0
- mov %rdx, %r9
- mov (up,n,8), %rbx
- add %rax, %rbx
- adc $0, %r9
- mov (mp,i,8), %rax
- mul q0
- mov (up,i,8), %r10
- add %r11, %rbx
- mov %rbx, -8(up,i,8) C next low remainder limb
- adc $0, %r9
- imul u0inv, %rbx C next q limb
- jmp L(e1)
-
- ALIGNx
-L(tp1): mul q0
- mov -16(up,i,8), %r10
- add %r11, %rbp
- mov %rdx, %r11
- adc $0, %r9
- mov %rbp, -24(up,i,8)
- add %rax, %r10
- mov -8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- add %r9, %r10
- mov %rdx, %r9
- mov -8(up,i,8), %rbp
- adc $0, %r11
- mov %r10, -16(up,i,8)
- add %rax, %rbp
- adc $0, %r9
- mov (mp,i,8), %rax
- mul q0
- mov (up,i,8), %r10
- add %r11, %rbp
- mov %rbp, -8(up,i,8)
- adc $0, %r9
-L(e1): mov %rdx, %r11
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- mov 8(up,i,8), %rbp
- add %r9, %r10
- mov %rdx, %r9
- mov %r10, (up,i,8)
- adc $0, %r11
- add %rax, %rbp
- adc $0, %r9
- mov 16(mp,i,8), %rax
- add $4, i
- jnc L(tp1)
-
-L(ed1): mul q0
- mov I(-16(up),-16(up,i,8)), %r10
- add %r11, %rbp
- adc $0, %r9
- mov %rbp, I(-24(up),-24(up,i,8))
- add %rax, %r10
- adc $0, %rdx
- add %r9, %r10
- adc $0, %rdx
- mov %r10, I(-16(up),-16(up,i,8))
- mov %rdx, -8(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp1)
- jmp L(cj)
-
-L(b3): cmp $-3, R32(n)
- jz L(n3)
-
-L(otp3):lea 3(n), i
- mov -8(mp,n,8), %rax
- mul q0
- mov -8(up,n,8), %r10
- mov %rdx, %r11
- add %rax, %r10
- mov (mp,n,8), %rax
- adc $0, %r11
- mul q0
- mov (up,n,8), %rbx
- mov %rdx, %r9
- add %rax, %rbx
- adc $0, %r9
- mov 8(mp,n,8), %rax
- mul q0
- mov 8(up,n,8), %r10
- add %r11, %rbx
- mov %rdx, %r11
- adc $0, %r9
- mov %rbx, (up,n,8)
- imul u0inv, %rbx C next q limb
- jmp L(e3)
-
- ALIGNx
-L(tp3): mul q0
- mov -16(up,i,8), %r10
- add %r11, %rbp
- mov %rdx, %r11
- adc $0, %r9
- mov %rbp, -24(up,i,8)
-L(e3): add %rax, %r10
- mov -8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- add %r9, %r10
- mov %rdx, %r9
- mov -8(up,i,8), %rbp
- adc $0, %r11
- mov %r10, -16(up,i,8)
- add %rax, %rbp
- adc $0, %r9
- mov (mp,i,8), %rax
- mul q0
- mov (up,i,8), %r10
- add %r11, %rbp
- mov %rbp, -8(up,i,8)
- adc $0, %r9
- mov %rdx, %r11
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- mov 8(up,i,8), %rbp
- add %r9, %r10
- mov %rdx, %r9
- mov %r10, (up,i,8)
- adc $0, %r11
- add %rax, %rbp
- adc $0, %r9
- mov 16(mp,i,8), %rax
- add $4, i
- jnc L(tp3)
-
-L(ed3): mul q0
- mov I(-16(up),-16(up,i,8)), %r10
- add %r11, %rbp
- adc $0, %r9
- mov %rbp, I(-24(up),-24(up,i,8))
- add %rax, %r10
- adc $0, %rdx
- add %r9, %r10
- adc $0, %rdx
- mov %r10, I(-16(up),-16(up,i,8))
- mov %rdx, -8(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp3)
-C jmp L(cj)
-
-L(cj):
-IFSTD(` lea -8(up,n,8), up C param 2: up
- lea (up,n,8), %rdx C param 3: up - n
- neg R32(n) ') C param 4: n
-
-IFDOS(` lea -8(up,n,8), %rdx C param 2: up
- lea (%rdx,n,8), %r8 C param 3: up - n
- neg R32(n)
- mov n, %r9 C param 4: n
- mov rp, %rcx ') C param 1: rp
-
- CALL( mpn_add_n)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(bx0): test $2, R8(n)
- jnz L(b2)
-
-L(b0):
-L(otp0):lea (n), i
- mov -8(mp,n,8), %rax
- mul q0
- mov %rdx, %r9
- mov -8(up,n,8), %rbp
- add %rax, %rbp
- adc $0, %r9
- mov (mp,n,8), %rax
- mul q0
- mov (up,n,8), %rbx
- mov %rdx, %r11
- add %rax, %rbx
- mov 8(mp,n,8), %rax
- adc $0, %r11
- mul q0
- mov 8(up,n,8), %rbp
- add %r9, %rbx
- mov %rdx, %r9
- mov %rbx, (up,n,8)
- adc $0, %r11
- imul u0inv, %rbx C next q limb
- jmp L(e0)
-
- ALIGNx
-L(tp0): mul q0
- mov -16(up,i,8), %r10
- add %r11, %rbp
- mov %rdx, %r11
- adc $0, %r9
- mov %rbp, -24(up,i,8)
- add %rax, %r10
- mov -8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- add %r9, %r10
- mov %rdx, %r9
- mov -8(up,i,8), %rbp
- adc $0, %r11
- mov %r10, -16(up,i,8)
- add %rax, %rbp
- adc $0, %r9
- mov (mp,i,8), %rax
- mul q0
- mov (up,i,8), %r10
- add %r11, %rbp
- mov %rbp, -8(up,i,8)
- adc $0, %r9
- mov %rdx, %r11
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- mov 8(up,i,8), %rbp
- add %r9, %r10
- mov %rdx, %r9
- mov %r10, (up,i,8)
- adc $0, %r11
-L(e0): add %rax, %rbp
- adc $0, %r9
- mov 16(mp,i,8), %rax
- add $4, i
- jnc L(tp0)
-
-L(ed0): mul q0
- mov I(-16(up),-16(up,i,8)), %r10
- add %r11, %rbp
- adc $0, %r9
- mov %rbp, I(-24(up),-24(up,i,8))
- add %rax, %r10
- adc $0, %rdx
- add %r9, %r10
- adc $0, %rdx
- mov %r10, I(-16(up),-16(up,i,8))
- mov %rdx, -8(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp0)
- jmp L(cj)
-
-L(b2): cmp $-2, R32(n)
- jz L(n2)
-
-L(otp2):lea 2(n), i
- mov -8(mp,n,8), %rax
- mul q0
- mov -8(up,n,8), %rbp
- mov %rdx, %r9
- add %rax, %rbp
- adc $0, %r9
- mov (mp,n,8), %rax
- mul q0
- mov (up,n,8), %rbx
- mov %rdx, %r11
- add %rax, %rbx
- mov 8(mp,n,8), %rax
- adc $0, %r11
- mul q0
- add %r9, %rbx
- mov %rdx, %r9
- mov 8(up,n,8), %rbp
- adc $0, %r11
- mov %rbx, (up,n,8)
- imul u0inv, %rbx C next q limb
- jmp L(e2)
-
- ALIGNx
-L(tp2): mul q0
- mov -16(up,i,8), %r10
- add %r11, %rbp
- mov %rdx, %r11
- adc $0, %r9
- mov %rbp, -24(up,i,8)
- add %rax, %r10
- mov -8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- add %r9, %r10
- mov %rdx, %r9
- mov -8(up,i,8), %rbp
- adc $0, %r11
- mov %r10, -16(up,i,8)
-L(e2): add %rax, %rbp
- adc $0, %r9
- mov (mp,i,8), %rax
- mul q0
- mov (up,i,8), %r10
- add %r11, %rbp
- mov %rbp, -8(up,i,8)
- adc $0, %r9
- mov %rdx, %r11
- add %rax, %r10
- mov 8(mp,i,8), %rax
- adc $0, %r11
- mul q0
- mov 8(up,i,8), %rbp
- add %r9, %r10
- mov %rdx, %r9
- mov %r10, (up,i,8)
- adc $0, %r11
- add %rax, %rbp
- adc $0, %r9
- mov 16(mp,i,8), %rax
- add $4, i
- jnc L(tp2)
-
-L(ed2): mul q0
- mov I(-16(up),-16(up,i,8)), %r10
- add %r11, %rbp
- adc $0, %r9
- mov %rbp, I(-24(up),-24(up,i,8))
- add %rax, %r10
- adc $0, %rdx
- add %r9, %r10
- adc $0, %rdx
- mov %r10, I(-16(up),-16(up,i,8))
- mov %rdx, -8(up,n,8) C up[0]
- mov %rbx, q0 C previously computed q limb -> q0
- lea 8(up), up C up++
- dec j
- jnz L(otp2)
- jmp L(cj)
-
-L(n1): mov (mp_param), %rax
- mul q0
- add -16(up), %rax
- adc -8(up), %rdx
- mov %rdx, (rp)
- mov $0, R32(%rax)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-L(n2): mov (mp_param), %rax
- mov -24(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -16(mp), %rax
- mov -16(up), %r10
- mul q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- add %r9, %r10
- adc $0, %r11
- mov %r10, q0
- imul u0inv, q0 C next q0
- mov -24(mp), %rax
- mul q0
- add %rax, %r10
- mov %rdx, %r9
- adc $0, %r9
- mov -16(mp), %rax
- mov -8(up), %r14
- mul q0
- add %rax, %r14
- adc $0, %rdx
- add %r9, %r14
- adc $0, %rdx
- xor R32(%rax), R32(%rax)
- add %r11, %r14
- adc (up), %rdx
- mov %r14, (rp)
- mov %rdx, 8(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
- ALIGNx
-L(n3): mov -32(mp), %rax
- mov -32(up), %r10
- mul q0
- add %rax, %r10
- mov -24(mp), %rax
- mov %rdx, %r11
- adc $0, %r11
- mov -24(up), %rbp
- mul q0
- add %rax, %rbp
- mov %rdx, %r9
- adc $0, %r9
- mov -16(mp), %rax
- add %r11, %rbp
- mov -16(up), %r10
- adc $0, %r9
- mul q0
- mov %rbp, q0
- imul u0inv, q0 C next q0
- add %rax, %r10
- mov %rdx, %r11
- adc $0, %r11
- mov %rbp, -24(up)
- add %r9, %r10
- adc $0, %r11
- mov %r10, -16(up)
- mov %r11, -32(up) C up[0]
- lea 8(up), up C up++
- dec j
- jnz L(n3)
- jmp L(cj)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/coreisbr/rsh1aors_n.asm b/gmp/mpn/x86_64/coreisbr/rsh1aors_n.asm
deleted file mode 100644
index fd2eaea7bb..0000000000
--- a/gmp/mpn/x86_64/coreisbr/rsh1aors_n.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Sandy Bridge.
-
-dnl Copyright 2003, 2005, 2009-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 4.25
-C Intel P4 21.5
-C Intel core2 3.2
-C Intel NHM 3.87
-C Intel SBR 2.05
-C Intel atom ?
-C VIA nano 44.9
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_rsh1add_n)
- define(func_nc, mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsh1sub_n)
- define(func_nc, mpn_rsh1sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
-
- ALIGN(16)
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
- push %rbp
-
- neg %r8 C set C flag from parameter
- mov (up), %rbp
- ADCSBB (vp), %rbp
-
- jmp L(ent)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
- push %rbx
- push %rbp
-
- mov (up), %rbp
- ADDSUB (vp), %rbp
-L(ent):
- sbb R32(%rbx), R32(%rbx) C save cy
- mov R32(%rbp), R32(%rax)
- and $1, R32(%rax) C return value
-
- mov R32(n), R32(%r11)
- and $3, R32(%r11)
-
- cmp $1, R32(%r11)
- je L(do) C jump if n = 1 5 9 ...
-
-L(n1): cmp $2, R32(%r11)
- jne L(n2) C jump unless n = 2 6 10 ...
- add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up), %r10
- ADCSBB 8(vp), %r10
- lea 8(up), up
- lea 8(vp), vp
- lea 8(rp), rp
- sbb R32(%rbx), R32(%rbx) C save cy
-
- shrd $1, %r10, %rbp
- mov %rbp, -8(rp)
- jmp L(cj1)
-
-L(n2): cmp $3, R32(%r11)
- jne L(n3) C jump unless n = 3 7 11 ...
- add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up), %r9
- mov 16(up), %r10
- ADCSBB 8(vp), %r9
- ADCSBB 16(vp), %r10
- lea 16(up), up
- lea 16(vp), vp
- lea 16(rp), rp
- sbb R32(%rbx), R32(%rbx) C save cy
-
- shrd $1, %r9, %rbp
- mov %rbp, -16(rp)
- jmp L(cj2)
-
-L(n3): dec n C come here for n = 4 8 12 ...
- add R32(%rbx), R32(%rbx) C restore cy
- mov 8(up), %r8
- mov 16(up), %r9
- ADCSBB 8(vp), %r8
- ADCSBB 16(vp), %r9
- mov 24(up), %r10
- ADCSBB 24(vp), %r10
- lea 24(up), up
- lea 24(vp), vp
- lea 24(rp), rp
- sbb R32(%rbx), R32(%rbx) C save cy
-
- shrd $1, %r8, %rbp
- mov %rbp, -24(rp)
- shrd $1, %r9, %r8
- mov %r8, -16(rp)
-L(cj2): shrd $1, %r10, %r9
- mov %r9, -8(rp)
-L(cj1): mov %r10, %rbp
-
-L(do):
- shr $2, n C 4
- je L(end) C 2
- ALIGN(16)
-L(top): add R32(%rbx), R32(%rbx) C restore cy
-
- mov 8(up), %r8
- mov 16(up), %r9
- ADCSBB 8(vp), %r8
- ADCSBB 16(vp), %r9
- mov 24(up), %r10
- mov 32(up), %r11
- ADCSBB 24(vp), %r10
- ADCSBB 32(vp), %r11
-
- lea 32(up), up
- lea 32(vp), vp
-
- sbb R32(%rbx), R32(%rbx) C save cy
-
- shrd $1, %r8, %rbp
- mov %rbp, (rp)
- shrd $1, %r9, %r8
- mov %r8, 8(rp)
- shrd $1, %r10, %r9
- mov %r9, 16(rp)
- shrd $1, %r11, %r10
- mov %r10, 24(rp)
-
- dec n
- mov %r11, %rbp
- lea 32(rp), rp
- jne L(top)
-
-L(end): shrd $1, %rbx, %rbp
- mov %rbp, (rp)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/coreisbr/rshift.asm b/gmp/mpn/x86_64/coreisbr/rshift.asm
deleted file mode 100644
index 4c1c0d4cde..0000000000
--- a/gmp/mpn/x86_64/coreisbr/rshift.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_rshift)
-include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/sec_tabselect.asm b/gmp/mpn/x86_64/coreisbr/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/coreisbr/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_sec_tabselect.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/coreisbr/sqr_basecase.asm b/gmp/mpn/x86_64/coreisbr/sqr_basecase.asm
deleted file mode 100644
index 46a36121fe..0000000000
--- a/gmp/mpn/x86_64/coreisbr/sqr_basecase.asm
+++ /dev/null
@@ -1,484 +0,0 @@
-dnl AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
-C AMD K8,K9 ? ? ?
-C AMD K10 ? ? ?
-C AMD bull ? ? ?
-C AMD pile ? ? ?
-C AMD steam ? ? ?
-C AMD bobcat ? ? ?
-C AMD jaguar ? ? ?
-C Intel P4 ? ? ?
-C Intel core ? ? ?
-C Intel NHM ? ? ?
-C Intel SBR 2.57 2.93 3.0
-C Intel IBR 2.35 2.66 3.0
-C Intel HWL 2.02 2.5 2.5
-C Intel BWL ? ? ?
-C Intel atom ? ? ?
-C VIA nano ? ? ?
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
-C that the sqr_diag_addlsh1 loop was manually written.
-
-C TODO
-C * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy.
-C * Streamline pointer updates.
-C * Perhaps suppress a few more xor insns in feed-in code.
-C * Make sure we write no dead registers in feed-in code.
-C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch
-C out for negative sizes being zero-extended, though.
-C * The straight-line code for n <= 3 comes from the K8 code, and might be
-C quite sub-optimal here. Write specific code, and add code for n = 4.
-C * The mul_2 loop has a 10 insn common sequence in the loop start and the
-C wind-down code. Try re-rolling it.
-C * This file has been the subject to just basic micro-optimisation.
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
- FUNC_ENTRY(3)
-
- cmp $2, un_param
- jae L(gt1)
-
- mov (up), %rax
- mul %rax
- mov %rax, (rp)
- mov %rdx, 8(rp)
- FUNC_EXIT()
- ret
-
-L(gt1): jne L(gt2)
-
- mov (up), %rax
- mov %rax, %r8
- mul %rax
- mov 8(up), %r11
- mov %rax, (rp)
- mov %r11, %rax
- mov %rdx, %r9
- mul %rax
- mov %rax, %r10
- mov %r11, %rax
- mov %rdx, %r11
- mul %r8
- xor %r8, %r8
- add %rax, %r9
- adc %rdx, %r10
- adc %r8, %r11
- add %rax, %r9
- mov %r9, 8(rp)
- adc %rdx, %r10
- mov %r10, 16(rp)
- adc %r8, %r11
- mov %r11, 24(rp)
- FUNC_EXIT()
- ret
-
-L(gt2): cmp $4, un_param
- jae L(gt3)
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%r10')
-define(`w2', `%r11')
-
- mov (up), %rax
- mov %rax, %r10
- mul %rax
- mov 8(up), %r11
- mov %rax, (rp)
- mov %r11, %rax
- mov %rdx, 8(rp)
- mul %rax
- mov 16(up), %rcx
- mov %rax, 16(rp)
- mov %rcx, %rax
- mov %rdx, 24(rp)
- mul %rax
- mov %rax, 32(rp)
- mov %rdx, 40(rp)
-
- mov %r11, %rax
- mul %r10
- mov %rax, %r8
- mov %rcx, %rax
- mov %rdx, %r9
- mul %r10
- xor %r10, %r10
- add %rax, %r9
- mov %r11, %rax
- mov %r10, %r11
- adc %rdx, %r10
-
- mul %rcx
- add %rax, %r10
- adc %r11, %rdx
- add %r8, %r8
- adc %r9, %r9
- adc %r10, %r10
- adc %rdx, %rdx
- adc %r11, %r11
- add %r8, 8(rp)
- adc %r9, 16(rp)
- adc %r10, 24(rp)
- adc %rdx, 32(rp)
- adc %r11, 40(rp)
- FUNC_EXIT()
- ret
-
-L(gt3):
-
-define(`v0', `%r8')
-define(`v1', `%r9')
-define(`w0', `%r10')
-define(`w1', `%r11')
-define(`w2', `%rbx')
-define(`w3', `%rbp')
-define(`un', `%r12')
-define(`n', `%rcx')
-
-define(`X0', `%r13')
-define(`X1', `%r14')
-
-L(do_mul_2):
- mov (up), v0
- push %rbx
- lea (rp,un_param,8), rp C point rp at R[un]
- mov 8(up), %rax
- push %rbp
- lea (up,un_param,8), up C point up right after U's end
- mov %rax, v1
- push %r12
- mov $1, R32(un) C free up rdx
- push %r13
- sub un_param, un
- push %r14
- push un
- mul v0
- mov %rax, (rp,un,8)
- mov 8(up,un,8), %rax
- test $1, R8(un)
- jnz L(m2b1)
-
-L(m2b0):lea 2(un), n
- xor R32(w1), R32(w1) C FIXME
- xor R32(w2), R32(w2) C FIXME
- mov %rdx, w0
- jmp L(m2l0)
-
-L(m2b1):lea 1(un), n
- xor R32(w3), R32(w3) C FIXME
- xor R32(w0), R32(w0) C FIXME
- mov %rdx, w2
- jmp L(m2l1)
-
- ALIGN(32)
-L(m2tp):
-L(m2l0):mul v0
- add %rax, w0
- mov %rdx, w3
- adc $0, w3
- mov -8(up,n,8), %rax
- mul v1
- add w1, w0
- adc $0, w3
- add %rax, w2
- mov w0, -8(rp,n,8)
- mov %rdx, w0
- adc $0, w0
- mov (up,n,8), %rax
-L(m2l1):mul v0
- add %rax, w2
- mov %rdx, w1
- adc $0, w1
- add w3, w2
- mov (up,n,8), %rax
- adc $0, w1
- mul v1
- mov w2, (rp,n,8)
- add %rax, w0
- mov %rdx, w2
- mov 8(up,n,8), %rax
- adc $0, w2
- add $2, n
- jnc L(m2tp)
-
-L(m2ed):mul v0
- add %rax, w0
- mov %rdx, w3
- adc $0, w3
- mov I(-8(up),-8(up,n,8)), %rax
- mul v1
- add w1, w0
- adc $0, w3
- add %rax, w2
- mov w0, I(-8(rp),-8(rp,n,8))
- adc $0, %rdx
- add w3, w2
- mov w2, I((rp),(rp,n,8))
- adc $0, %rdx
- mov %rdx, I(8(rp),8(rp,n,8))
-
- add $2, un C decrease |un|
-
-L(do_addmul_2):
-L(outer):
- lea 16(rp), rp
- cmp $-2, R32(un) C jump if un C {-1,0} FIXME jump if un C {-2,1}
- jge L(corner) C FIXME: move to before the lea above
-
- mov -8(up,un,8), v0
- mov (up,un,8), %rax
- mov %rax, v1
- mul v0
- test $1, R8(un)
- jnz L(a1x1)
-
-L(a1x0):mov (rp,un,8), X0
- xor w0, w0
- mov 8(rp,un,8), X1
- add %rax, X0
- mov %rdx, w1
- adc $0, w1
- xor w2, w2
- mov X0, (rp,un,8)
- mov 8(up,un,8), %rax
- test $2, R8(un)
- jnz L(a110)
-
-L(a100):lea 2(un), n C un = 4, 8, 12, ...
- jmp L(lo0)
-
-L(a110):lea (un), n C un = 2, 6, 10, ...
- jmp L(lo2)
-
-L(a1x1):mov (rp,un,8), X1
- xor w2, w2
- mov 8(rp,un,8), X0
- add %rax, X1
- mov %rdx, w3
- adc $0, w3
- xor w0, w0
- mov 8(up,un,8), %rax
- test $2, R8(un)
- jz L(a111)
-
-L(a101):lea 3(un), n C un = 1, 5, 9, ...
- jmp L(lo1)
-
-L(a111):lea 1(un), n C un = 3, 7, 11, ...
- jmp L(lo3)
-
- ALIGN(32)
-L(top): mul v1
- mov %rdx, w0
- add %rax, X0
- adc $0, w0
- add w1, X1
- adc $0, w3
- add w2, X0
- adc $0, w0
- mov -16(up,n,8), %rax
-L(lo1): mul v0
- add %rax, X0
- mov %rdx, w1
- adc $0, w1
- mov -16(up,n,8), %rax
- mul v1
- mov X1, -24(rp,n,8)
- mov -8(rp,n,8), X1
- add w3, X0
- adc $0, w1
- mov %rdx, w2
- mov X0, -16(rp,n,8)
- add %rax, X1
- adc $0, w2
- mov -8(up,n,8), %rax
- add w0, X1
- adc $0, w2
-L(lo0): mul v0
- add %rax, X1
- mov %rdx, w3
- adc $0, w3
- mov -8(up,n,8), %rax
- mul v1
- add w1, X1
- mov (rp,n,8), X0
- adc $0, w3
- mov %rdx, w0
- add %rax, X0
- adc $0, w0
- mov (up,n,8), %rax
-L(lo3): mul v0
- add w2, X0
- mov X1, -8(rp,n,8)
- mov %rdx, w1
- adc $0, w0
- add %rax, X0
- adc $0, w1
- mov (up,n,8), %rax
- add w3, X0
- adc $0, w1
- mul v1
- mov 8(rp,n,8), X1
- add %rax, X1
- mov %rdx, w2
- adc $0, w2
- mov 8(up,n,8), %rax
- mov X0, (rp,n,8)
-L(lo2): mul v0
- add w0, X1
- mov %rdx, w3
- adc $0, w2
- add %rax, X1
- mov 8(up,n,8), %rax
- mov 16(rp,n,8), X0
- adc $0, w3
- add $4, n
- jnc L(top)
-
-L(end): mul v1
- add w1, X1
- adc $0, w3
- add w2, %rax
- adc $0, %rdx
- mov X1, I(-8(rp),-24(rp,n,8))
- add w3, %rax
- adc $0, %rdx
- mov %rax, I((rp),-16(rp,n,8))
- mov %rdx, I(8(rp),-8(rp,n,8))
-
- add $2, un C decrease |un|
- jmp L(outer) C loop until a small corner remains
-
-L(corner):
- pop n
- jg L(small_corner)
-
- lea 8(rp), rp
- mov -24(up), v0
- mov -16(up), %rax
- mov %rax, v1
- mul v0
- mov -24(rp), X0
- mov -16(rp), X1
- add %rax, X0
- mov %rdx, w1
- adc $0, w1
- xor w2, w2
- mov X0, -24(rp)
- mov -8(up), %rax
- mul v0
- add $0, X1
- mov %rdx, w3
- adc $0, w2
- add %rax, X1
- mov -8(up), %rax
- adc $0, w3
- mul v1
- add w1, X1
- adc $0, w3
- add w2, %rax
- adc $0, %rdx
- mov X1, -16(rp)
- jmp L(com)
-
-L(small_corner):
- mov -8(rp), w3
- mov -16(up), v0
- mov -8(up), %rax
- mul v0
-L(com): add w3, %rax
- adc $0, %rdx
- mov %rax, -8(rp)
- mov %rdx, (rp)
-
-L(sqr_diag_addlsh1):
- mov -8(up,n,8), %rax
- shl n
- mul %rax
- mov %rax, (rp,n,8)
-
- xor R32(%rbx), R32(%rbx)
- mov 8(rp,n,8), %r8
- mov 16(rp,n,8), %r9
- jmp L(dm)
-
- ALIGN(32)
-L(dtop):add %r8, %r10
- adc %r9, %rax
- mov 8(rp,n,8), %r8
- mov 16(rp,n,8), %r9
- mov %r10, -8(rp,n,8)
- mov %rax, (rp,n,8)
-L(dm): adc %r8, %r8
- adc %r9, %r9
- mov (up,n,4), %rax
- lea (%rdx,%rbx), %r10
- setc R8(%rbx)
- mul %rax
- add $2, n
- js L(dtop)
-
-L(dend):add %r8, %r10
- adc %r9, %rax
- mov %r10, I(-8(rp),-8(rp,n,8))
- mov %rax, I((rp),(rp,n,8))
- adc %rbx, %rdx
- mov %rdx, I(8(rp),8(rp,n,8))
-
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/darwin.m4 b/gmp/mpn/x86_64/darwin.m4
index 6f8ec7893d..9eb0f53723 100644
--- a/gmp/mpn/x86_64/darwin.m4
+++ b/gmp/mpn/x86_64/darwin.m4
@@ -1,41 +1,23 @@
divert(-1)
-dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
define(`DARWIN')
-define(`LEA',`dnl
-ifdef(`PIC',
- `lea $1(%rip), $2'
-,
- `movabs `$'$1, $2')
-')
-
dnl Usage: CALL(funcname)
dnl
dnl Simply override the definition in x86_64-defs.m4.
@@ -43,39 +25,6 @@ dnl Simply override the definition in x86_64-defs.m4.
define(`CALL',`call GSYM_PREFIX`'$1')
-dnl Usage: JUMPTABSECT
-dnl
-dnl CAUTION: Do not put anything sensible here, like RODATA. That works with
-dnl some Darwin tool chains, but silently breaks with other. (Note that
-dnl putting jump tables in the text segment is a really poor idea for PC many
-dnl processors, since they cannot cache the same thing in both L1D and L2I.)
-
-define(`JUMPTABSECT', `.text')
-
-
-dnl Usage: JMPENT(targlabel,tablabel)
-
-define(`JMPENT',`dnl
-ifdef(`PIC',
- `.set $1_tmp, $1-$2
- .long $1_tmp'
-,
- `.quad $1'
-)')
-
-dnl Target ABI macros. For Darwin we override IFELF (and leave default for
-dnl IFDOS and IFSTD).
-
-define(`IFELF', `')
-
-
-dnl Usage: PROTECT(symbol)
-dnl
-dnl Used for private GMP symbols that should never be overridden by users.
-dnl This can save reloc entries and improve shlib sharing as well as
-dnl application startup times
-
-define(`PROTECT', `.private_extern $1')
-
+define(`JUMPTABSECT', `DATA')
divert`'dnl
diff --git a/gmp/mpn/x86_64/div_qr_1n_pi1.asm b/gmp/mpn/x86_64/div_qr_1n_pi1.asm
deleted file mode 100644
index cb072e979d..0000000000
--- a/gmp/mpn/x86_64/div_qr_1n_pi1.asm
+++ /dev/null
@@ -1,247 +0,0 @@
-dnl x86-64 mpn_div_qr_1n_pi1
-dnl -- Divide an mpn number by a normalized single-limb number,
-dnl using a single-limb inverse.
-
-dnl Contributed to the GNU project by Niels Möller
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C c/l
-C AMD K8,K9 13
-C AMD K10 13
-C AMD bull 16.5
-C AMD pile 15
-C AMD steam ?
-C AMD bobcat 16
-C AMD jaguar ?
-C Intel P4 47 poor
-C Intel core 19.25
-C Intel NHM 18
-C Intel SBR 15 poor
-C Intel IBR 13
-C Intel HWL 11.7
-C Intel BWL ?
-C Intel atom 52 very poor
-C VIA nano 19
-
-
-C INPUT Parameters
-define(`QP', `%rdi')
-define(`UP', `%rsi')
-define(`UN_INPUT', `%rdx')
-define(`U1', `%rcx') C Also in %rax
-define(`D', `%r8')
-define(`DINV', `%r9')
-
-C Invariants
-define(`B2', `%rbp')
-define(`B2md', `%rbx')
-
-C Variables
-define(`UN', `%r8') C Overlaps D input
-define(`T', `%r10')
-define(`U0', `%r11')
-define(`U2', `%r12')
-define(`Q0', `%r13')
-define(`Q1', `%r14')
-define(`Q2', `%r15')
-
-ABI_SUPPORT(STD64)
-
- ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_div_qr_1n_pi1)
- FUNC_ENTRY(6)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
- dec UN_INPUT
- jnz L(first)
-
- C Just a single 2/1 division.
- C T, U0 are allocated in scratch registers
- lea 1(U1), T
- mov U1, %rax
- mul DINV
- mov (UP), U0
- add U0, %rax
- adc T, %rdx
- mov %rdx, T
- imul D, %rdx
- sub %rdx, U0
- cmp U0, %rax
- lea (U0, D), %rax
- cmovnc U0, %rax
- sbb $0, T
- cmp D, %rax
- jc L(single_div_done)
- sub D, %rax
- add $1, T
-L(single_div_done):
- mov T, (QP)
- FUNC_EXIT
- ret
-L(first):
- C FIXME: Could delay some of these until we enter the loop.
- push %r15
- push %r14
- push %r13
- push %r12
- push %rbx
- push %rbp
-
- mov D, B2
- imul DINV, B2
- neg B2
- mov B2, B2md
- sub D, B2md
-
- C D not needed until final reduction
- push D
- mov UN_INPUT, UN C Clobbers D
-
- mov DINV, %rax
- mul U1
- mov %rax, Q0
- add U1, %rdx
- mov %rdx, T
-
- mov B2, %rax
- mul U1
- mov -8(UP, UN, 8), U0
- mov (UP, UN, 8), U1
- mov T, (QP, UN, 8)
- add %rax, U0
- adc %rdx, U1
- sbb U2, U2
- dec UN
- mov U1, %rax
- jz L(final)
-
- ALIGN(16)
-
- C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
- C At entry, %rax holds an extra copy of U1
-L(loop):
- C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
- C Remains to add in B (U1 + c)
- mov DINV, Q1
- mov U2, Q2
- and U2, Q1
- neg Q2
- mul DINV
- add %rdx, Q1
- adc $0, Q2
- add Q0, Q1
- mov %rax, Q0
- mov B2, %rax
- lea (B2md, U0), T
- adc $0, Q2
-
- C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
- mul U1
- and B2, U2
- add U2, U0
- cmovnc U0, T
-
- C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
- adc U1, Q1
- mov -8(UP, UN, 8), U0
- adc Q2, 8(QP, UN, 8)
- jc L(q_incr)
-L(q_incr_done):
- add %rax, U0
- mov T, %rax
- adc %rdx, %rax
- mov Q1, (QP, UN, 8)
- sbb U2, U2
- dec UN
- mov %rax, U1
- jnz L(loop)
-
-L(final):
- pop D
-
- mov U2, Q1
- and D, U2
- sub U2, %rax
- neg Q1
-
- mov %rax, U1
- sub D, %rax
- cmovc U1, %rax
- sbb $-1, Q1
-
- lea 1(%rax), T
- mul DINV
- add U0, %rax
- adc T, %rdx
- mov %rdx, T
- imul D, %rdx
- sub %rdx, U0
- cmp U0, %rax
- lea (U0, D), %rax
- cmovnc U0, %rax
- sbb $0, T
- cmp D, %rax
- jc L(div_done)
- sub D, %rax
- add $1, T
-L(div_done):
- add T, Q0
- mov Q0, (QP)
- adc Q1, 8(QP)
- jnc L(done)
-L(final_q_incr):
- addq $1, 16(QP)
- lea 8(QP), QP
- jc L(final_q_incr)
-
-L(done):
- pop %rbp
- pop %rbx
- pop %r12
- pop %r13
- pop %r14
- pop %r15
- FUNC_EXIT
- ret
-
-L(q_incr):
- C U1 is not live, so use it for indexing
- lea 16(QP, UN, 8), U1
-L(q_incr_loop):
- addq $1, (U1)
- jnc L(q_incr_done)
- lea 8(U1), U1
- jmp L(q_incr_loop)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/div_qr_2n_pi1.asm b/gmp/mpn/x86_64/div_qr_2n_pi1.asm
deleted file mode 100644
index 5e59a0ac5d..0000000000
--- a/gmp/mpn/x86_64/div_qr_2n_pi1.asm
+++ /dev/null
@@ -1,158 +0,0 @@
-dnl x86-64 mpn_div_qr_2n_pi1
-dnl -- Divide an mpn number by a normalized 2-limb number,
-dnl using a single-limb inverse.
-
-dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C c/l
-C INPUT PARAMETERS
-define(`qp', `%rdi')
-define(`rp', `%rsi')
-define(`up_param', `%rdx')
-define(`un', `%rcx')
-define(`d1', `%r8')
-define(`d0', `%r9')
-define(`di_param', `8(%rsp)')
-
-define(`di', `%r10')
-define(`up', `%r11')
-define(`u2', `%rbx')
-define(`u1', `%r12')
-define(`t1', `%r13')
-define(`t0', `%r14')
-define(`md1', `%r15')
-
-C TODO
-C * Store qh in the same stack slot as di_param, instead of pushing
-C it. (we could put it in register %rbp, but then we would need to
-C save and restore that instead, which doesn't seem like a win).
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_div_qr_2n_pi1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
-IFDOS(`define(`di_param', `72(%rsp)')')
- mov di_param, di
- mov up_param, up
- push %r15
- push %r14
- push %r13
- push %r12
- push %rbx
-
- mov -16(up, un, 8), u1
- mov -8(up, un, 8), u2
-
- mov u1, t0
- mov u2, t1
- sub d0, t0
- sbb d1, t1
- cmovnc t0, u1
- cmovnc t1, u2
- C push qh which is !carry
- sbb %rax, %rax
- inc %rax
- push %rax
- lea -2(un), un
- mov d1, md1
- neg md1
-
- jmp L(next)
-
- ALIGN(16)
-L(loop):
- C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
- C Based on the optimized divrem_2.asm code.
-
- mov di, %rax
- mul u2
- mov u1, t0
- add %rax, t0 C q0 in t0
- adc u2, %rdx
- mov %rdx, t1 C q in t1
- imul md1, %rdx
- mov d0, %rax
- lea (%rdx, u1), u2
- mul t1
- mov (up, un, 8), u1
- sub d0, u1
- sbb d1, u2
- sub %rax, u1
- sbb %rdx, u2
- xor R32(%rax), R32(%rax)
- xor R32(%rdx), R32(%rdx)
- cmp t0, u2
- cmovnc d0, %rax
- cmovnc d1, %rdx
- adc $0, t1
- nop
- add %rax, u1
- adc %rdx, u2
- cmp d1, u2
- jae L(fix)
-L(bck):
- mov t1, (qp, un, 8)
-L(next):
- sub $1, un
- jnc L(loop)
-L(end):
- mov u2, 8(rp)
- mov u1, (rp)
-
- C qh on stack
- pop %rax
-
- pop %rbx
- pop %r12
- pop %r13
- pop %r14
- pop %r15
- FUNC_EXIT()
- ret
-
-L(fix): C Unlikely update. u2 >= d1
- seta %dl
- cmp d0, u1
- setae %al
- orb %dl, %al C "orb" form to placate Sun tools
- je L(bck)
- inc t1
- sub d0, u1
- sbb d1, u2
- jmp L(bck)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/div_qr_2u_pi1.asm b/gmp/mpn/x86_64/div_qr_2u_pi1.asm
deleted file mode 100644
index 85af96fbf6..0000000000
--- a/gmp/mpn/x86_64/div_qr_2u_pi1.asm
+++ /dev/null
@@ -1,200 +0,0 @@
-dnl x86-64 mpn_div_qr_2u_pi1
-dnl -- Divide an mpn number by an unnormalized 2-limb number,
-dnl using a single-limb inverse and shifting the dividend on the fly.
-
-dnl Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C c/l
-C INPUT PARAMETERS
-define(`qp', `%rdi')
-define(`rp', `%rsi')
-define(`up_param', `%rdx')
-define(`un_param', `%rcx') dnl %rcx needed for shift count
-define(`d1', `%r8')
-define(`d0', `%r9')
-define(`shift_param', `FRAME+8(%rsp)')
-define(`di_param', `FRAME+16(%rsp)')
-
-define(`di', `%r10')
-define(`up', `%r11')
-define(`un', `%rbp')
-define(`u2', `%rbx')
-define(`u1', `%r12')
-define(`u0', `%rsi') dnl Same as rp, which is saved and restored.
-define(`t1', `%r13')
-define(`t0', `%r14')
-define(`md1', `%r15')
-
-ASM_START()
- TEXT
- ALIGN(16)
-deflit(`FRAME', 0)
-PROLOGUE(mpn_div_qr_2u_pi1)
- mov di_param, di
- mov up_param, up
- push %r15
- push %r14
- push %r13
- push %r12
- push %rbx
- push %rbp
- push rp
-deflit(`FRAME', 56)
- lea -2(un_param), un
- mov d1, md1
- neg md1
-
- C int parameter, 32 bits only
- movl shift_param, R32(%rcx)
-
- C FIXME: Different code for SHLD_SLOW
-
- xor R32(u2), R32(u2)
- mov 8(up, un, 8), u1
- shld %cl, u1, u2
- C Remains to read (up, un, 8) and shift u1, u0
- C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di)
- mov di, %rax
- mul u2
- mov (up, un, 8), u0
- shld %cl, u0, u1
- mov u1, t0
- add %rax, t0 C q0 in t0
- adc u2, %rdx
- mov %rdx, t1 C q in t1
- imul md1, %rdx
- mov d0, %rax
- lea (%rdx, u1), u2
- mul t1
- mov u0, u1
- shl %cl, u1
- sub d0, u1
- sbb d1, u2
- sub %rax, u1
- sbb %rdx, u2
- xor R32(%rax), R32(%rax)
- xor R32(%rdx), R32(%rdx)
- cmp t0, u2
- cmovnc d0, %rax
- cmovnc d1, %rdx
- adc $0, t1
- nop
- add %rax, u1
- adc %rdx, u2
- cmp d1, u2
- jae L(fix_qh)
-L(bck_qh):
- push t1 C push qh on stack
-
- jmp L(next)
-
- ALIGN(16)
-L(loop):
- C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
- C Based on the optimized divrem_2.asm code.
-
- mov di, %rax
- mul u2
- mov (up, un, 8), u0
- xor R32(t1), R32(t1)
- shld %cl, u0, t1
- or t1, u1
- mov u1, t0
- add %rax, t0 C q0 in t0
- adc u2, %rdx
- mov %rdx, t1 C q in t1
- imul md1, %rdx
- mov d0, %rax
- lea (%rdx, u1), u2
- mul t1
- mov u0, u1
- shl %cl, u1
- sub d0, u1
- sbb d1, u2
- sub %rax, u1
- sbb %rdx, u2
- xor R32(%rax), R32(%rax)
- xor R32(%rdx), R32(%rdx)
- cmp t0, u2
- cmovnc d0, %rax
- cmovnc d1, %rdx
- adc $0, t1
- nop
- add %rax, u1
- adc %rdx, u2
- cmp d1, u2
- jae L(fix)
-L(bck):
- mov t1, (qp, un, 8)
-L(next):
- sub $1, un
- jnc L(loop)
-L(end):
- C qh on stack
- pop %rax
- pop rp
- shrd %cl, u2, u1
- shr %cl, u2
- mov u2, 8(rp)
- mov u1, (rp)
-
- pop %rbp
- pop %rbx
- pop %r12
- pop %r13
- pop %r14
- pop %r15
- ret
-
-L(fix): C Unlikely update. u2 >= d1
- seta %dl
- cmp d0, u1
- setae %al
- orb %dl, %al C "orb" form to placate Sun tools
- je L(bck)
- inc t1
- sub d0, u1
- sbb d1, u2
- jmp L(bck)
-
-C Duplicated, just jumping back to a different address.
-L(fix_qh): C Unlikely update. u2 >= d1
- seta %dl
- cmp d0, u1
- setae %al
- orb %dl, %al C "orb" form to placate Sun tools
- je L(bck_qh)
- inc t1
- sub d0, u1
- sbb d1, u2
- jmp L(bck_qh)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/dive_1.asm b/gmp/mpn/x86_64/dive_1.asm
index 988bdab632..4889faccb5 100644
--- a/gmp/mpn/x86_64/dive_1.asm
+++ b/gmp/mpn/x86_64/dive_1.asm
@@ -1,44 +1,31 @@
dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
-dnl Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 10
-C AMD K10 10
-C Intel P4 33
-C Intel core2 13.25
-C Intel corei 14
-C Intel atom 42
-C VIA nano 43
+C K8,K9: 10
+C K10: 10
+C P4: 33
+C P6-15 (Core2):13.25
+C P6-28 (Atom): 42
C A quick adoption of the 32-bit K7 code.
@@ -49,66 +36,67 @@ C up rsi
C n rdx
C divisor rcx
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_divexact_1)
- FUNC_ENTRY(4)
- push %rbx
+ pushq %rbx
- mov %rcx, %rax
- xor R32(%rcx), R32(%rcx) C shift count
- mov %rdx, %r8
+ movq %rcx, %rax
+ movl $0, %ecx C shift count
+ movq %rdx, %r8
- bt $0, R32(%rax)
+ btl $0, %eax
jnc L(evn) C skip bsfq unless divisor is even
-L(odd): mov %rax, %rbx
- shr R32(%rax)
- and $127, R32(%rax) C d/2, 7 bits
+L(odd): movq %rax, %rbx
+ shrl %eax
+ andl $127, %eax C d/2, 7 bits
- LEA( binvert_limb_table, %rdx)
+ifdef(`PIC',`
+ movq binvert_limb_table@GOTPCREL(%rip), %rdx
+',`
+ movabsq $binvert_limb_table, %rdx
+')
- movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
+ movzbl (%rax,%rdx), %eax C inv 8 bits
- mov %rbx, %r11 C d without twos
+ movq %rbx, %r11 C d without twos
- lea (%rax,%rax), R32(%rdx) C 2*inv
- imul R32(%rax), R32(%rax) C inv*inv
- imul R32(%rbx), R32(%rax) C inv*inv*d
- sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
+ leal (%rax,%rax), %edx C 2*inv
+ imull %eax, %eax C inv*inv
+ imull %ebx, %eax C inv*inv*d
+ subl %eax, %edx C inv = 2*inv - inv*inv*d, 16 bits
- lea (%rdx,%rdx), R32(%rax) C 2*inv
- imul R32(%rdx), R32(%rdx) C inv*inv
- imul R32(%rbx), R32(%rdx) C inv*inv*d
- sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
+ leal (%rdx,%rdx), %eax C 2*inv
+ imull %edx, %edx C inv*inv
+ imull %ebx, %edx C inv*inv*d
+ subl %edx, %eax C inv = 2*inv - inv*inv*d, 32 bits
- lea (%rax,%rax), %r10 C 2*inv
- imul %rax, %rax C inv*inv
- imul %rbx, %rax C inv*inv*d
- sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits
+ leaq (%rax,%rax), %rdx C 2*inv
+ imulq %rax, %rax C inv*inv
+ imulq %rbx, %rax C inv*inv*d
+ subq %rax, %rdx C inv = 2*inv - inv*inv*d, 64 bits
- lea (%rsi,%r8,8), %rsi C up end
- lea -8(%rdi,%r8,8), %rdi C rp end
- neg %r8 C -n
+ leaq (%rsi,%r8,8), %rsi C up end
+ leaq -8(%rdi,%r8,8), %rdi C rp end
+ negq %r8 C -n
- mov (%rsi,%r8,8), %rax C up[0]
+ movq %rdx, %r10 C final inverse
+ movq (%rsi,%r8,8), %rax C up[0]
- inc %r8
+ incq %r8
jz L(one)
- mov (%rsi,%r8,8), %rdx C up[1]
+ movq (%rsi,%r8,8), %rdx C up[1]
- shrd R8(%rcx), %rdx, %rax
+ shrdq %cl, %rdx, %rax
- xor R32(%rbx), R32(%rbx)
- jmp L(ent)
+ xorl %ebx, %ebx
+ jmp L(entry)
-L(evn): bsf %rax, %rcx
- shr R8(%rcx), %rax
+L(evn): bsfq %rax, %rcx
+ shrq %cl, %rax
jmp L(odd)
ALIGN(8)
@@ -120,39 +108,54 @@ L(top):
C rsi up end
C rdi rp end
C r8 counter, limbs, negative
- C r10 d^(-1) mod 2^64
- C r11 d, shifted down
-
- mul %r11 C carry limb in rdx 0 10
- mov -8(%rsi,%r8,8), %rax C
- mov (%rsi,%r8,8), %r9 C
- shrd R8(%rcx), %r9, %rax C
- nop C
- sub %rbx, %rax C apply carry bit
- setc %bl C
- sub %rdx, %rax C apply carry limb 5
- adc $0, %rbx C 6
-L(ent): imul %r10, %rax C 6
- mov %rax, (%rdi,%r8,8) C
- inc %r8 C
+
+ mulq %r11 C carry limb in rdx
+
+ movq -8(%rsi,%r8,8), %rax
+ movq (%rsi,%r8,8), %r9
+
+ shrdq %cl, %r9, %rax
+ nop
+
+ subq %rbx, %rax C apply carry bit
+ setc %bl
+
+ subq %rdx, %rax C apply carry limb
+ adcq $0, %rbx
+
+L(entry):
+ imulq %r10, %rax
+
+ movq %rax, (%rdi,%r8,8)
+ incq %r8
jnz L(top)
- mul %r11 C carry limb in rdx
- mov -8(%rsi), %rax C up high limb
- shr R8(%rcx), %rax
- sub %rbx, %rax C apply carry bit
- sub %rdx, %rax C apply carry limb
- imul %r10, %rax
- mov %rax, (%rdi)
- pop %rbx
- FUNC_EXIT()
+
+ mulq %r11 C carry limb in rdx
+
+ movq -8(%rsi), %rax C up high limb
+ shrq %cl, %rax
+
+ subq %rbx, %rax C apply carry bit
+
+ subq %rdx, %rax C apply carry limb
+
+ imulq %r10, %rax
+
+ movq %rax, (%rdi)
+
+ popq %rbx
ret
-L(one): shr R8(%rcx), %rax
- imul %r10, %rax
- mov %rax, (%rdi)
- pop %rbx
- FUNC_EXIT()
+
+L(one):
+ shrq %cl, %rax
+
+ imulq %r10, %rax
+
+ movq %rax, (%rdi)
+
+ popq %rbx
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/divrem_1.asm b/gmp/mpn/x86_64/divrem_1.asm
index 91928d9aa3..2f3e95a839 100644
--- a/gmp/mpn/x86_64/divrem_1.asm
+++ b/gmp/mpn/x86_64/divrem_1.asm
@@ -1,44 +1,42 @@
dnl x86-64 mpn_divrem_1 -- mpn by limb division.
-dnl Copyright 2004, 2005, 2007-2012 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C norm unorm frac
-C AMD K8,K9 13 13 12
-C AMD K10 13 13 12
-C Intel P4 43 44 43
-C Intel core2 24.5 24.5 19.5
-C Intel corei 20.5 19.5 18
-C Intel atom 43 46 36
-C VIA nano 25.5 25.5 24
+C K8 13 13 12
+C P4 44.2 44.2 42.3
+C P6-15 (Core2) 24.5 24.5 19.3
+C P6-15 (Atom) 42 52 37
+
+C TODO
+C * Compute the inverse without relying on the div instruction.
+C Newton's method and mulq, or perhaps the faster fdiv.
+C * Tune prologue.
+C * Optimize for Core 2.
+
+C The code for unnormalized divisors works also for normalized divisors, but
+C for some reason it runs really slowly (on K8) for that case. Use special
+C code until we can address this. The Intel Atom is also affected, but
+C understandably (shld slowness).
+define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
@@ -67,20 +65,11 @@ define(`un', `%rbx')
C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
C cnt qp d dinv
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFSTD(`define(`CNTOFF', `40($1)')')
-IFDOS(`define(`CNTOFF', `104($1)')')
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
- xor R32(%rax), R32(%rax)
+ xor %eax, %eax
push %r13
push %r12
push %rbp
@@ -95,17 +84,14 @@ IFDOS(` mov 64(%rsp), %r9 ')
test d, d
js L(nent)
-
- mov CNTOFF(%rsp), R8(cnt)
+ mov 40(%rsp), R8(cnt)
shl R8(cnt), d
jmp L(uent)
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_divrem_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- xor R32(%rax), R32(%rax)
+ xor %eax, %eax
push %r13
push %r12
push %rbp
@@ -120,6 +106,8 @@ IFDOS(` mov 56(%rsp), %r8 ')
lea -8(qp,un_param,8), qp
xor R32(%rbp), R32(%rbp)
+
+ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
test d, d
jns L(unnormalized)
@@ -130,54 +118,50 @@ L(normalized):
dec un
mov %rbp, %rax
sub d, %rbp
- cmovc %rax, %rbp
- sbb R32(%rax), R32(%rax)
- inc R32(%rax)
+ cmovb %rax, %rbp
+ sbb %eax, %eax
+ inc %eax
mov %rax, (qp)
lea -8(qp), qp
L(8):
-IFSTD(` push %rdi ')
-IFSTD(` push %rsi ')
- push %r8
-IFSTD(` mov d, %rdi ')
-IFDOS(` mov d, %rcx ')
- CALL( mpn_invert_limb)
- pop %r8
-IFSTD(` pop %rsi ')
-IFSTD(` pop %rdi ')
-
+ mov d, %rdx
+ mov $-1, %rax
+ not %rdx
+ div d C FREE rax rdx rcx r9 r10 r11
mov %rax, dinv
mov %rbp, %rax
jmp L(nent)
ALIGN(16)
-L(ntop):mov (up,un,8), %r10 C K8-K10 P6-CNR P6-NHM P4
- mul dinv C 0,13 0,20 0,18 0,45
- add %r10, %rax C 4 8 3 12
- adc %rbp, %rdx C 5 9 10 13
- mov %rax, %rbp C 5 9 4 13
- mov %rdx, %r13 C 6 11 12 23
- imul d, %rdx C 6 11 11 23
- sub %rdx, %r10 C 10 16 14 33
+L(nloop): C cycK8 cycP6 cycP4
+ mov (up,un,8), %r10 C
+ lea 1(%rax), %rbp C
+ mul dinv C 0,13 0,19 0,45
+ add %r10, %rax C 4 8 12
+ adc %rbp, %rdx C 5 9 13
+ mov %rax, %rbp C 5 9 13
+ mov %rdx, %r13 C 6 11 23
+ imul d, %rdx C 6 11 23
+ sub %rdx, %r10 C 10 16 33
mov d, %rax C
- add %r10, %rax C 11 17 15 34
- cmp %rbp, %r10 C 11 17 15 34
- cmovc %r10, %rax C 12 18 16 35
+ add %r10, %rax C 11 17 34
+ cmp %rbp, %r10 C 11 17 34
+ cmovb %r10, %rax C 12 18 35
adc $-1, %r13 C
cmp d, %rax C
jae L(nfx) C
L(nok): mov %r13, (qp) C
sub $8, qp C
-L(nent):lea 1(%rax), %rbp C
- dec un C
- jns L(ntop) C
+L(nent):dec un C
+ jns L(nloop) C
- xor R32(%rcx), R32(%rcx)
+ xor %ecx, %ecx
jmp L(87)
L(nfx): sub d, %rax
inc %r13
jmp L(nok)
+')
L(unnormalized):
test un, un
@@ -192,42 +176,30 @@ L(unnormalized):
dec un
L(44):
bsr d, %rcx
- not R32(%rcx)
- shl R8(%rcx), d
- shl R8(%rcx), %rbp
-
- push %rcx
-IFSTD(` push %rdi ')
-IFSTD(` push %rsi ')
- push %r8
-IFSTD(` mov d, %rdi ')
-IFDOS(` mov d, %rcx ')
- CALL( mpn_invert_limb)
- pop %r8
-IFSTD(` pop %rsi ')
-IFSTD(` pop %rdi ')
- pop %rcx
-
+ not %ecx
+ sal %cl, d
+ sal %cl, %rbp
+ mov d, %rdx
+ mov $-1, %rax
+ not %rdx
+ div d C FREE rax rdx r9 r10 r11
+ test un, un
mov %rax, dinv
mov %rbp, %rax
- test un, un
je L(87)
-
-L(uent):dec un
- mov (up,un,8), %rbp
- neg R32(%rcx)
- shr R8(%rcx), %rbp
- neg R32(%rcx)
- or %rbp, %rax
- jmp L(ent)
+L(uent):
+ mov -8(up,un,8), %rbp
+ shr %cl, %rax
+ shld %cl, %rbp, %rax
+ sub $2, un
+ js L(ulast)
ALIGN(16)
-L(utop):mov (up,un,8), %r10
- shl R8(%rcx), %rbp
- neg R32(%rcx)
- shr R8(%rcx), %r10
- neg R32(%rcx)
- or %r10, %rbp
+L(uloop):
+ nop
+ mov (up,un,8), %r10
+ lea 1(%rax), %r11
+ shld %cl, %r10, %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -238,18 +210,18 @@ L(utop):mov (up,un,8), %r10
mov d, %rax
add %rbp, %rax
cmp %r11, %rbp
- cmovc %rbp, %rax
+ cmovb %rbp, %rax
adc $-1, %r13
cmp d, %rax
jae L(ufx)
L(uok): mov %r13, (qp)
sub $8, qp
-L(ent): mov (up,un,8), %rbp
dec un
+ mov %r10, %rbp
+ jns L(uloop)
+L(ulast):
lea 1(%rax), %r11
- jns L(utop)
-
-L(uend):shl R8(%rcx), %rbp
+ sal %cl, %rbp
mul dinv
add %rbp, %rax
adc %r11, %rdx
@@ -260,47 +232,48 @@ L(uend):shl R8(%rcx), %rbp
mov d, %rax
add %rbp, %rax
cmp %r11, %rbp
- cmovc %rbp, %rax
+ cmovb %rbp, %rax
adc $-1, %r13
cmp d, %rax
- jae L(efx)
-L(eok): mov %r13, (qp)
+ jae L(93)
+L(69): mov %r13, (qp)
sub $8, qp
jmp L(87)
L(ufx): sub d, %rax
inc %r13
jmp L(uok)
-L(efx): sub d, %rax
+
+L(93): sub d, %rax
inc %r13
- jmp L(eok)
+ jmp L(69)
L(87): mov d, %rbp
neg %rbp
- jmp L(fent)
-
- ALIGN(16) C K8-K10 P6-CNR P6-NHM P4
-L(ftop):mul dinv C 0,12 0,17 0,17
- add %r11, %rdx C 5 8 10
- mov %rax, %r11 C 4 8 3
- mov %rdx, %r13 C 6 9 11
- imul %rbp, %rdx C 6 9 11
+ jmp L(87b)
+
+ ALIGN(16)
+L(floop): C cycK8 cycP6 cycP4
+ lea 1(%rax), %r11 C
+ mul dinv C 0,12
+ add %r11, %rdx C 5
+ mov %rax, %r11 C 4
+ mov %rdx, %r13 C 6
+ imul %rbp, %rdx C 6
mov d, %rax C
- add %rdx, %rax C 10 14 14
- cmp %r11, %rdx C 10 14 14
- cmovc %rdx, %rax C 11 15 15
+ add %rdx, %rax C 10
+ cmp %r11, %rdx C 10
+ cmovb %rdx, %rax C 11
adc $-1, %r13 C
mov %r13, (qp) C
sub $8, qp C
-L(fent):lea 1(%rax), %r11 C
- dec fn C
- jns L(ftop) C
+L(87b): dec fn C
+ jns L(floop) C
- shr R8(%rcx), %rax
+ shr %cl, %rax
L(ret): pop %rbx
pop %rbp
pop %r12
pop %r13
- FUNC_EXIT()
ret
-EPILOGUE()
+EPILOGUE(mpn_divrem_1)
diff --git a/gmp/mpn/x86_64/divrem_2.asm b/gmp/mpn/x86_64/divrem_2.asm
index 66c2da1a05..37053ba88d 100644
--- a/gmp/mpn/x86_64/divrem_2.asm
+++ b/gmp/mpn/x86_64/divrem_2.asm
@@ -1,52 +1,37 @@
dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
-dnl Copyright 2007, 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb best
-C AMD K8,K9 18
-C AMD K10 18
-C AMD bull
-C AMD pile
-C AMD bobcat
-C AMD jaguar
-C Intel P4 68
-C Intel core 34
-C Intel NHM 30.25
-C Intel SBR 21.3
-C Intel IBR 21.4
-C Intel HWL 20.6
-C Intel BWL
-C Intel atom 73
-C VIA nano 33
+C norm frac
+C K8 20 20
+C P4 73 73
+C P6-15 37 37
+
+C TODO
+C * Perhaps compute the inverse without relying on divq? Could either use
+C Newton's method and mulq, or perhaps the faster fdiv.
+C * The loop has not been carefully tuned, nor analysed for critical path
+C length. It seems that 20 c/l is a bit long, compared to the 13 c/l for
+C mpn_divrem_1.
+C * Clean up. This code is really crude.
C INPUT PARAMETERS
@@ -56,117 +41,168 @@ define(`up_param', `%rdx')
define(`un_param', `%rcx')
define(`dp', `%r8')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`dinv', `%r9')
+
+
+C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C cnt qp d dinv
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_divrem_2)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
+
push %r15
+ lea (%rdx,%rcx,8), %rax
push %r14
push %r13
- push %r12
- lea -24(%rdx,%rcx,8), %r12 C r12 = &up[un-1]
mov %rsi, %r13
+ push %r12
+ lea -24(%rax), %r12
push %rbp
mov %rdi, %rbp
push %rbx
- mov 8(%r8), %r11 C d1
- mov 16(%r12), %rbx
- mov (%r8), %r8 C d0
- mov 8(%r12), %r10
-
+ mov 8(%r8), %r11
+ mov -8(%rax), %r9
+ mov (%r8), %r8
+ mov -16(%rax), %r10
xor R32(%r15), R32(%r15)
- cmp %rbx, %r11
+ cmp %r9, %r11
ja L(2)
setb %dl
cmp %r10, %r8
setbe %al
- orb %al, %dl C "orb" form to placate Sun tools
- je L(2)
- inc R32(%r15)
- sub %r8, %r10
- sbb %r11, %rbx
+ or %al, %dl
+ jne L(23)
L(2):
- lea -3(%rcx,%r13), %r14 C un + fn - 3
- test %r14, %r14
- js L(end)
-
- push %r8
- push %r10
- push %r11
-IFSTD(` mov %r11, %rdi ')
-IFDOS(` mov %r11, %rcx ')
- CALL( mpn_invert_limb)
- pop %r11
- pop %r10
- pop %r8
-
+ lea -3(%rcx,%r13), %rbx C un + fn - 3
+ test %rbx, %rbx
+ js L(6)
+ mov %r11, %rdx
+ mov $-1, %rax
+ not %rdx
+ div %r11
mov %r11, %rdx
mov %rax, %rdi
imul %rax, %rdx
- mov %rdx, %r9
+ mov %rdx, %r14
mul %r8
- xor R32(%rcx), R32(%rcx)
- add %r8, %r9
- adc $-1, %rcx
- add %rdx, %r9
- adc $0, %rcx
- js 2f
-1: dec %rdi
- sub %r11, %r9
- sbb $0, %rcx
- jns 1b
-2:
-
- lea (%rbp,%r14,8), %rbp
+ mov %rdx, %rcx
+ mov $-1, %rdx
+ add %r8, %r14
+ adc $0, %rdx
+ add %rcx, %r14
+ adc $0, %rdx
+ js L(8)
+L(18):
+ dec %rdi
+ sub %r11, %r14
+ sbb $0, %rdx
+ jns L(18)
+L(8):
+
+C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C n2 un n1 dinv qp d0 d1 up fn msl
+C n2 un -d1 n1 dinv XX XX
+
+ifdef(`NEW',`
+ lea (%rbp,%rbx,8), %rbp
+ mov %rbx, %rcx C un
+ mov %r9, %rbx
+ mov %rdi, %r9 C di
+ mov %r10, %r14
mov %r11, %rsi
neg %rsi C -d1
-
-C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C n2 un -d1 dinv qp d0 q0 d1 up fn msl
-
ALIGN(16)
-L(top): mov %rdi, %rax C di ncp
- mul %rbx C 0, 17
- mov %r10, %rcx C
- add %rax, %rcx C 4
+L(loop):
+ mov %r9, %rax C di ncp
+ mul %rbx C 0, 18
+ add %r14, %rax C 4
+ mov %rax, %r10 C q0 5
adc %rbx, %rdx C 5
- mov %rdx, %r9 C q 6
+ mov %rdx, %rdi C q 6
imul %rsi, %rdx C 6
mov %r8, %rax C ncp
- lea (%rdx, %r10), %rbx C n1 -= ... 10
- xor R32(%r10), R32(%r10) C
- mul %r9 C 7
- cmp %r14, %r13 C
+ lea (%rdx, %r14), %rbx C n1 -= ... 7
+ mul %rdi C 7
+ xor R32(%r14), R32(%r14) C
+ cmp %rcx, %r13 C
jg L(19) C
- mov (%r12), %r10 C
+ mov (%r12), %r14 C
sub $8, %r12 C
-L(19): sub %r8, %r10 C ncp
- sbb %r11, %rbx C 11
- sub %rax, %r10 C 11
+L(19): sub %r8, %r14 C ncp
+ sbb %r11, %rbx C 9
+ sub %rax, %r14 C 11
sbb %rdx, %rbx C 12
- xor R32(%rax), R32(%rax) C
+ inc %rdi C 7
xor R32(%rdx), R32(%rdx) C
- cmp %rcx, %rbx C 13
- cmovnc %r8, %rax C 14
- cmovnc %r11, %rdx C 14
- adc $0, %r9 C adjust q 14
- nop
- add %rax, %r10 C 15
+ cmp %r10, %rbx C 13
+ mov %r8, %rax C d0 ncp
+ adc $-1, %rdx C mask 14
+ add %rdx, %rdi C q-- 15
+ and %rdx, %rax C d0 or 0 15
+ and %r11, %rdx C d1 or 0 15
+ add %rax, %r14 C 16
adc %rdx, %rbx C 16
- cmp %r11, %rbx C
+ cmp %r11, %rbx C 17
jae L(fix) C
-L(bck): mov %r9, (%rbp) C
+L(bck): mov %rdi, (%rbp) C
sub $8, %rbp C
- dec %r14
- jns L(top)
-
-L(end): mov %r10, 8(%r12)
- mov %rbx, 16(%r12)
+ dec %rcx
+ jns L(loop)
+
+ mov %r14, %r10
+ mov %rbx, %r9
+',`
+ lea (%rbp,%rbx,8), %rbp
+ mov %rbx, %rcx
+ mov %r9, %rax
+ mov %r10, %rsi
+ ALIGN(16)
+L(loop):
+ mov %rax, %r14 C 0, 19
+ mul %rdi C 0
+ mov %r11, %r9 C 1
+ add %rsi, %rax C 4
+ mov %rax, %rbx C q0 5
+ adc %r14, %rdx C q 5
+ lea 1(%rdx), %r10 C 6
+ mov %rdx, %rax C 6
+ imul %rdx, %r9 C 6
+ sub %r9, %rsi C 10
+ xor R32(%r9), R32(%r9) C
+ mul %r8 C 7
+ cmp %rcx, %r13 C
+ jg L(13) C
+ mov (%r12), %r9 C
+ sub $8, %r12 C
+L(13): sub %r8, %r9 C ncp
+ sbb %r11, %rsi C 11
+ sub %rax, %r9 C 11
+ sbb %rdx, %rsi C 12
+ cmp %rbx, %rsi C 13
+ sbb %rax, %rax C 14
+ not %rax C 15
+ add %rax, %r10 C 16
+ mov %r8, %rbx C ncp
+ and %rax, %rbx C 16
+ and %r11, %rax C 16
+ add %rbx, %r9 C 17
+ adc %rsi, %rax C 18
+ cmp %rax, %r11 C 19
+ jbe L(fix) C
+L(bck): mov %r10, (%rbp) C
+ sub $8, %rbp C
+ mov %r9, %rsi C 18
+ dec %rcx
+ jns L(loop)
+
+ mov %rsi, %r10
+ mov %rax, %r9
+')
+L(6):
+ mov %r10, 8(%r12)
+ mov %r9, 16(%r12)
pop %rbx
pop %rbp
pop %r12
@@ -174,16 +210,30 @@ L(end): mov %r10, 8(%r12)
pop %r14
mov %r15, %rax
pop %r15
- FUNC_EXIT()
ret
+L(23): inc R32(%r15)
+ sub %r8, %r10
+ sbb %r11, %r9
+ jmp L(2)
+
+ifdef(`NEW',`
L(fix): seta %dl
- cmp %r8, %r10
+ cmp %r8, %r14
setae %al
- orb %dl, %al C "orb" form to placate Sun tools
+ orb %dl, %al
je L(bck)
- inc %r9
- sub %r8, %r10
+ inc %rdi
+ sub %r8, %r14
sbb %r11, %rbx
jmp L(bck)
+',`
+L(fix): jb L(88)
+ cmp %r8, %r9
+ jb L(bck)
+L(88): inc %r10
+ sub %r8, %r9
+ sbb %r11, %rax
+ jmp L(bck)
+')
EPILOGUE()
diff --git a/gmp/mpn/x86_64/dos64.m4 b/gmp/mpn/x86_64/dos64.m4
deleted file mode 100644
index 9414623b56..0000000000
--- a/gmp/mpn/x86_64/dos64.m4
+++ /dev/null
@@ -1,100 +0,0 @@
-divert(-1)
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-define(`HOST_DOS64')
-
-
-dnl On DOS64 we always generate position-independent-code
-dnl
-
-define(`PIC')
-
-
-define(`LEA',`
- lea $1(%rip), $2
-')
-
-
-dnl Usage: CALL(funcname)
-dnl
-dnl Simply override the definition in x86_64-defs.m4.
-
-define(`CALL',`call GSYM_PREFIX`'$1')
-
-
-dnl Usage: JUMPTABSECT
-
-define(`JUMPTABSECT', `RODATA')
-
-
-dnl Usage: JMPENT(targlabel,tablabel)
-
-define(`JMPENT', `.long $1-$2')
-
-
-dnl Usage: FUNC_ENTRY(nregparams)
-dnl Usage: FUNC_EXIT()
-
-dnl FUNC_ENTRY and FUNC_EXIT provide an easy path for adoption of standard
-dnl ABI assembly to the DOS64 ABI.
-
-define(`FUNC_ENTRY',
- `push %rdi
- push %rsi
- mov %rcx, %rdi
-ifelse(eval($1>=2),1,`dnl
- mov %rdx, %rsi
-ifelse(eval($1>=3),1,`dnl
- mov %r8, %rdx
-ifelse(eval($1>=4),1,`dnl
- mov %r9, %rcx
-')')')')
-
-define(`FUNC_EXIT',
- `pop %rsi
- pop %rdi')
-
-
-dnl Target ABI macros. For DOS64 we override the defaults.
-
-define(`IFDOS', `$1')
-define(`IFSTD', `')
-define(`IFELF', `')
-
-
-dnl Usage: PROTECT(symbol)
-dnl
-dnl Used for private GMP symbols that should never be overridden by users.
-dnl This can save reloc entries and improve shlib sharing as well as
-dnl application startup times
-
-define(`PROTECT', `')
-
-
-divert`'dnl
diff --git a/gmp/mpn/x86_64/fastavx/copyd.asm b/gmp/mpn/x86_64/fastavx/copyd.asm
deleted file mode 100644
index 41c55de5ca..0000000000
--- a/gmp/mpn/x86_64/fastavx/copyd.asm
+++ /dev/null
@@ -1,171 +0,0 @@
-dnl AMD64 mpn_copyd optimised for CPUs with fast AVX.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003, 2005, 2007, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb aligned unaligned best seen for cpu?
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile 4.87 4.87 N
-C AMD steam ? ?
-C AMD bobcat n/a
-C AMD jaguar n/a
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR 0.50 0.91 N
-C Intel IBR ?
-C Intel HWL 0.25 0.30 Y
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C We try to do as many 32-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. For the bulk copying, we
-C write using aligned 32-byte operations, but we read with both aligned and
-C unaligned 32-byte operations.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-dnl define(`vmovdqu', vlddqu)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_copyd)
- FUNC_ENTRY(3)
-
- lea -32(rp,n,8), rp
- lea -32(up,n,8), up
-
- cmp $7, n C basecase needed for correctness
- jbe L(bc)
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(a2) C jump if rp aligned
- mov 24(up), %rax
- lea -8(up), up
- mov %rax, 24(rp)
- lea -8(rp), rp
- dec n
-L(a2): test $16, R8(rp) C is rp 32-byte aligned?
- jz L(a3) C jump if rp aligned
- vmovdqu 16(up), %xmm0
- lea -16(up), up
- vmovdqa %xmm0, 16(rp)
- lea -16(rp), rp
- sub $2, n
-L(a3): sub $16, n
- jc L(sma)
-
- ALIGN(16)
-L(top): vmovdqu (up), %ymm0
- vmovdqu -32(up), %ymm1
- vmovdqu -64(up), %ymm2
- vmovdqu -96(up), %ymm3
- lea -128(up), up
- vmovdqa %ymm0, (rp)
- vmovdqa %ymm1, -32(rp)
- vmovdqa %ymm2, -64(rp)
- vmovdqa %ymm3, -96(rp)
- lea -128(rp), rp
-L(ali): sub $16, n
- jnc L(top)
-
-L(sma): test $8, R8(n)
- jz 1f
- vmovdqu (up), %ymm0
- vmovdqu -32(up), %ymm1
- lea -64(up), up
- vmovdqa %ymm0, (rp)
- vmovdqa %ymm1, -32(rp)
- lea -64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- vmovdqu (up), %ymm0
- lea -32(up), up
- vmovdqa %ymm0, (rp)
- lea -32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- vmovdqu 16(up), %xmm0
- lea -16(up), up
- vmovdqa %xmm0, 16(rp)
- lea -16(rp), rp
-1:
- test $1, R8(n)
- jz 1f
- mov 24(up), %r8
- mov %r8, 24(rp)
-1:
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(bc): test $4, R8(n)
- jz 1f
- mov 24(up), %rax
- mov 16(up), %rcx
- mov 8(up), %r8
- mov (up), %r9
- lea -32(up), up
- mov %rax, 24(rp)
- mov %rcx, 16(rp)
- mov %r8, 8(rp)
- mov %r9, (rp)
- lea -32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- mov 24(up), %rax
- mov 16(up), %rcx
- lea -16(up), up
- mov %rax, 24(rp)
- mov %rcx, 16(rp)
- lea -16(rp), rp
-1:
- test $1, R8(n)
- jz 1f
- mov 24(up), %rax
- mov %rax, 24(rp)
-1:
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastavx/copyi.asm b/gmp/mpn/x86_64/fastavx/copyi.asm
deleted file mode 100644
index 97264ef837..0000000000
--- a/gmp/mpn/x86_64/fastavx/copyi.asm
+++ /dev/null
@@ -1,168 +0,0 @@
-dnl AMD64 mpn_copyi optimised for CPUs with fast AVX.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003, 2005, 2007, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb aligned unaligned best seen for cpu?
-C AMD K8,K9 n/a
-C AMD K10 n/a
-C AMD bull n/a
-C AMD pile 4.87 4.87 N
-C AMD steam ? ?
-C AMD bobcat n/a
-C AMD jaguar n/a
-C Intel P4 n/a
-C Intel core n/a
-C Intel NHM n/a
-C Intel SBR 0.50 0.91 N
-C Intel IBR ?
-C Intel HWL 0.25 0.30 Y
-C Intel BWL ?
-C Intel atom n/a
-C VIA nano n/a
-
-C We try to do as many 32-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. For the bulk copying, we
-C write using aligned 32-byte operations, but we read with both aligned and
-C unaligned 32-byte operations.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-dnl define(`vmovdqu', vlddqu)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_copyi)
- FUNC_ENTRY(3)
-
- cmp $7, n
- jbe L(bc)
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(a2) C jump if rp aligned
- mov (up), %rax
- lea 8(up), up
- mov %rax, (rp)
- lea 8(rp), rp
- dec n
-L(a2): test $16, R8(rp) C is rp 32-byte aligned?
- jz L(a3) C jump if rp aligned
- vmovdqu (up), %xmm0
- lea 16(up), up
- vmovdqa %xmm0, (rp)
- lea 16(rp), rp
- sub $2, n
-L(a3): sub $16, n
- jc L(sma)
-
- ALIGN(16)
-L(top): vmovdqu (up), %ymm0
- vmovdqu 32(up), %ymm1
- vmovdqu 64(up), %ymm2
- vmovdqu 96(up), %ymm3
- lea 128(up), up
- vmovdqa %ymm0, (rp)
- vmovdqa %ymm1, 32(rp)
- vmovdqa %ymm2, 64(rp)
- vmovdqa %ymm3, 96(rp)
- lea 128(rp), rp
-L(ali): sub $16, n
- jnc L(top)
-
-L(sma): test $8, R8(n)
- jz 1f
- vmovdqu (up), %ymm0
- vmovdqu 32(up), %ymm1
- lea 64(up), up
- vmovdqa %ymm0, (rp)
- vmovdqa %ymm1, 32(rp)
- lea 64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- vmovdqu (up), %ymm0
- lea 32(up), up
- vmovdqa %ymm0, (rp)
- lea 32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- vmovdqu (up), %xmm0
- lea 16(up), up
- vmovdqa %xmm0, (rp)
- lea 16(rp), rp
-1:
-L(end): test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-1:
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(bc): test $4, R8(n)
- jz 1f
- mov (up), %rax
- mov 8(up), %rcx
- mov 16(up), %r8
- mov 24(up), %r9
- lea 32(up), up
- mov %rax, (rp)
- mov %rcx, 8(rp)
- mov %r8, 16(rp)
- mov %r9, 24(rp)
- lea 32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- mov (up), %rax
- mov 8(up), %rcx
- lea 16(up), up
- mov %rax, (rp)
- mov %rcx, 8(rp)
- lea 16(rp), rp
-1:
- test $1, R8(n)
- jz 1f
- mov (up), %rax
- mov %rax, (rp)
-1:
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/README b/gmp/mpn/x86_64/fastsse/README
deleted file mode 100644
index 520551ed99..0000000000
--- a/gmp/mpn/x86_64/fastsse/README
+++ /dev/null
@@ -1,21 +0,0 @@
-This directory contains code for x86-64 processors with fast
-implementations of SSE operations, hence the name "fastsse".
-
-Current processors that might benefit from this code are:
-
- AMD K10
- AMD Bulldozer
- Intel Nocona
- Intel Nehalem/Westmere
- Intel Sandybridge/Ivybridge
- VIA Nano
-
-Current processors that do not benefit from this code are:
-
- AMD K8
- AMD Bobcat
- Intel Atom
-
-Intel Conroe/Penryn is a border case; its handling of non-aligned
-128-bit memory operands is poor. VIA Nano also has poor handling of
-non-aligned operands.
diff --git a/gmp/mpn/x86_64/fastsse/com-palignr.asm b/gmp/mpn/x86_64/fastsse/com-palignr.asm
deleted file mode 100644
index d9641e890d..0000000000
--- a/gmp/mpn/x86_64/fastsse/com-palignr.asm
+++ /dev/null
@@ -1,302 +0,0 @@
-dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 illop 1.0/1.0 N
-C AMD K10 0.85 illop Y/N
-C AMD bd1 1.39 ? 1.45 Y/N
-C AMD bobcat 1.97 ? 8.17 1.5/1.5 N
-C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.82 opt/0.74 Y
-C Intel NHM 0.52 0.65 opt/opt Y
-C Intel SBR 0.51 0.55 opt/0.51 Y
-C Intel atom 1.16 1.70 opt/opt Y
-C VIA nano 1.09 1.10 opt/opt Y
-
-C We use only 16-byte operations, except for unaligned top-most and bottom-most
-C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
-C instruction is better adapted to mpn_copyd's needs, we need to contort the
-C code to use it here.
-C
-C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
-C from the x86_64 default code.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-C There are three instructions for loading an aligned 128-bit quantity. We use
-C movaps, since it has the shortest coding.
-define(`movdqa', ``movaps'')
-
-ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_com)
- FUNC_ENTRY(3)
-
- cmp $COM_SSE_THRESHOLD, n
- jbe L(bc)
-
- pcmpeqb %xmm7, %xmm7 C set to 111...111
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(rp_aligned) C jump if rp aligned
-
- mov (up), %r8
- lea 8(up), up
- not %r8
- mov %r8, (rp)
- lea 8(rp), rp
- dec n
-
-L(rp_aligned):
- test $8, R8(up)
- jnz L(uent)
-
-ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
-` sub $8, n',
-` jmp L(am)')
-
- ALIGN(16)
-L(atop):movdqa 0(up), %xmm0
- movdqa 16(up), %xmm1
- movdqa 32(up), %xmm2
- movdqa 48(up), %xmm3
- lea 64(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- pxor %xmm7, %xmm2
- pxor %xmm7, %xmm3
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-L(am): sub $8, n
- jnc L(atop)
-
- test $4, R8(n)
- jz 1f
- movdqa (up), %xmm0
- movdqa 16(up), %xmm1
- lea 32(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa (up), %xmm0
- lea 16(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- not %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-L(uent):
-C Code handling up - rp = 8 (mod 16)
-
-C FIXME: The code below only handles overlap if it is close to complete, or
-C quite separate: up-rp < 5 or up-rp > 15 limbs
- lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES
- sub rp, %rax
- cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES
- jbe L(bc) C deflect to plain loop
-
- sub $16, n
- jc L(uend)
-
- movdqa 120(up), %xmm3
-
- sub $16, n
- jmp L(um)
-
- ALIGN(16)
-L(utop):movdqa 120(up), %xmm3
- pxor %xmm7, %xmm0
- movdqa %xmm0, -128(rp)
- sub $16, n
-L(um): movdqa 104(up), %xmm2
- palignr($8, %xmm2, %xmm3)
- movdqa 88(up), %xmm1
- pxor %xmm7, %xmm3
- movdqa %xmm3, 112(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa 72(up), %xmm0
- pxor %xmm7, %xmm2
- movdqa %xmm2, 96(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa 56(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 80(rp)
- palignr($8, %xmm3, %xmm0)
- movdqa 40(up), %xmm2
- pxor %xmm7, %xmm0
- movdqa %xmm0, 64(rp)
- palignr($8, %xmm2, %xmm3)
- movdqa 24(up), %xmm1
- pxor %xmm7, %xmm3
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa 8(up), %xmm0
- pxor %xmm7, %xmm2
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 128(up), up
- lea 128(rp), rp
- jnc L(utop)
-
- pxor %xmm7, %xmm0
- movdqa %xmm0, -128(rp)
-
-L(uend):test $8, R8(n)
- jz 1f
- movdqa 56(up), %xmm3
- movdqa 40(up), %xmm2
- palignr($8, %xmm2, %xmm3)
- movdqa 24(up), %xmm1
- pxor %xmm7, %xmm3
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa 8(up), %xmm0
- pxor %xmm7, %xmm2
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 64(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 64(rp), rp
-
-1: test $4, R8(n)
- jz 1f
- movdqa 24(up), %xmm1
- movdqa 8(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
- pxor %xmm7, %xmm1
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 32(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa 8(up), %xmm0
- movdqa -8(up), %xmm3
- palignr($8, %xmm3, %xmm0)
- lea 16(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- not %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-
-L(bc): lea -8(rp), rp
- sub $4, R32(n)
- jc L(end)
-
-ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
-` ALIGN(16)')
-L(top): mov (up), %r8
- mov 8(up), %r9
- lea 32(rp), rp
- mov 16(up), %r10
- mov 24(up), %r11
- lea 32(up), up
- not %r8
- not %r9
- not %r10
- not %r11
- mov %r8, -24(rp)
- mov %r9, -16(rp)
-ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
-` sub $4, R32(n)')
- mov %r10, -8(rp)
- mov %r11, (rp)
-ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
-` jnc L(top)')
-
-L(end): test $1, R8(n)
- jz 1f
- mov (up), %r8
- not %r8
- mov %r8, 8(rp)
- lea 8(rp), rp
- lea 8(up), up
-1: test $2, R8(n)
- jz 1f
- mov (up), %r8
- mov 8(up), %r9
- not %r8
- not %r9
- mov %r8, 8(rp)
- mov %r9, 16(rp)
-1: FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/com.asm b/gmp/mpn/x86_64/fastsse/com.asm
deleted file mode 100644
index 4abb076d3f..0000000000
--- a/gmp/mpn/x86_64/fastsse/com.asm
+++ /dev/null
@@ -1,161 +0,0 @@
-dnl AMD64 mpn_com optimised for CPUs with fast SSE.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 2.0 N
-C AMD K10 0.85 1.3 Y/N
-C AMD bd1 1.40 1.40 Y
-C AMD bobcat 3.1 3.1 N
-C Intel P4 2.28 illop Y
-C Intel core2 1.02 1.02 N
-C Intel NHM 0.53 0.68 Y
-C Intel SBR 0.51 0.75 Y
-C Intel atom 3.68 3.68 N
-C VIA nano 1.17 5.09 Y/N
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We can always write using
-C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
-C operations.
-
-C Instead of having separate loops for reading aligned and unaligned, we read
-C using MOVDQU. This seems to work great except for core2; there performance
-C doubles when reading using MOVDQA (for aligned source). It is unclear how to
-C best handle the unaligned case there.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-C mpn_com: store the bitwise complement of the n limbs at up into rp.
-C Register roles come from the defines above: rp = destination, up = source,
-C n = limb count. n = 0 returns immediately. Scratch: %rax, %xmm0-%xmm7;
-C %xmm7 holds the all-ones mask from the pcmpeqb below until the last 16-byte
-C store.
-C
-C %xmm6 and %xmm7 are callee-saved in the Microsoft x64 calling convention, and
-C this file declares ABI_SUPPORT(DOS64), so the DOS64 build must preserve them.
-C The frame is the same 56-byte one, with aligned saves, that the IFDOS
-C sequence in fastsse/copyi-palignr.asm uses right after FUNC_ENTRY(3).
-C NOTE(review): this assumes %rsp is 8 (mod 16) after FUNC_ENTRY under DOS64,
-C so that %rsp - 56 is 16-byte aligned for movdqa -- confirm against dos64.m4.
-PROLOGUE(mpn_com)
- FUNC_ENTRY(3)
-
- test n, n
- jz L(don) C nothing to do; nothing has been saved yet
-
-IFDOS(` add $-56, %rsp ')
-IFDOS(` movdqa %xmm6, (%rsp) ')
-IFDOS(` movdqa %xmm7, 16(%rsp) ')
-
- pcmpeqb %xmm7, %xmm7 C set to 111...111
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(ali) C jump if rp aligned
- mov (up), %rax C complement one limb so that rp becomes aligned
- lea 8(up), up
- not %rax
- mov %rax, (rp)
- lea 8(rp), rp
- dec n
-
- sub $14, n
- jc L(sma) C fewer than 14 limbs left: skip the main loop
-
-C Main loop: 14 limbs (7 x 16 bytes) per iteration, unaligned loads and
-C aligned stores.
- ALIGN(16)
-L(top): movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- movdqu 64(up), %xmm4
- movdqu 80(up), %xmm5
- movdqu 96(up), %xmm6
- lea 112(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- pxor %xmm7, %xmm2
- pxor %xmm7, %xmm3
- pxor %xmm7, %xmm4
- pxor %xmm7, %xmm5
- pxor %xmm7, %xmm6
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- movdqa %xmm4, 64(rp)
- movdqa %xmm5, 80(rp)
- movdqa %xmm6, 96(rp)
- lea 112(rp), rp
-L(ali): sub $14, n
- jnc L(top)
-
-L(sma): add $14, n C undo the last sub: n = limbs left, 0..13
- test $8, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- lea 64(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- pxor %xmm7, %xmm2
- pxor %xmm7, %xmm3
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- lea 32(up), up
- pxor %xmm7, %xmm0
- pxor %xmm7, %xmm1
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- movdqu (up), %xmm0
- lea 16(up), up
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-1:
- test $1, R8(n)
- jz 1f
- mov (up), %rax
- not %rax
- mov %rax, (rp)
-1:
-C Every path that reaches this point went through the IFDOS save above; the
-C n = 0 path enters at L(don) and therefore skips the restore.
-IFDOS(` movdqa (%rsp), %xmm6 ')
-IFDOS(` movdqa 16(%rsp), %xmm7 ')
-IFDOS(` add $56, %rsp ')
-L(don): FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/copyd-palignr.asm b/gmp/mpn/x86_64/fastsse/copyd-palignr.asm
deleted file mode 100644
index 7430cadc09..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyd-palignr.asm
+++ /dev/null
@@ -1,251 +0,0 @@
-dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 illop 1.0/1.0 N
-C AMD K10 0.85 illop Y/N
-C AMD bull 0.70 0.70 Y
-C AMD pile 0.68 0.68 Y
-C AMD steam ? ?
-C AMD bobcat 1.97 8.24 1.5/1.5 N
-C AMD jaguar ? ?
-C Intel P4 2.26 illop Y/N
-C Intel core 0.52 0.68-0.80 opt/0.64 Y
-C Intel NHM 0.52 0.64 opt/opt Y
-C Intel SBR 0.51 0.51 opt/0.51 Y
-C Intel IBR ? ? Y
-C Intel HWL 0.51 0.51 0.25/0.25 N
-C Intel atom 1.16 1.66 opt/opt Y
-C VIA nano 1.08 1.06 opt/opt Y
-
-C We use only 16-byte operations, except for unaligned top-most and bottom-most
-C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
-C
-C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
-C taken from the x86_64 default code.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-C There are three instructions for loading an aligned 128-bit quantity. We use
-C movaps, since it has the shortest coding.
-define(`movdqa', ``movaps'')
-
-ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
-
-ASM_START()
- TEXT
- ALIGN(64)
-C mpn_copyd, SSSE3 palignr variant: copy n limbs from up to rp, working from
-C the highest address downwards. Register roles come from the defines above:
-C rp = destination, up = source, n = limb count.
-C Scratch: %rax, %r8-%r11, %xmm0-%xmm3.
-C Note that movdqa is redefined above to emit movaps throughout this function.
-C NOTE(review): the alignment tests on bit 3 assume rp and up are 8-byte
-C aligned limb pointers -- confirm against the mpn calling contract.
-PROLOGUE(mpn_copyd)
- FUNC_ENTRY(3)
-
- lea -8(up,n,8), up C up -> most significant source limb
- lea -8(rp,n,8), rp C rp -> most significant destination limb
-
- cmp $COPYD_SSE_THRESHOLD, n
- jbe L(bc) C small operand: plain 64-bit code
-
-C The 16-byte stores below address -8(rp), so they are aligned when bit 3 of
-C rp is set.
- test $8, R8(rp) C is rp 16-byte aligned?
- jnz L(rp_aligned) C jump if rp aligned
-
- mov (up), %rax C copy one limb
- mov %rax, (rp)
- lea -8(up), up
- lea -8(rp), rp
- dec n
-
-L(rp_aligned):
- test $8, R8(up) C does up have the same alignment as rp?
- jz L(uent) C no: use the palignr code
-
-C With a threshold of at least 8 the first loop test is done here and control
-C falls into L(atop); otherwise control enters at the loop test L(am).
-ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
-` sub $8, n',
-` jmp L(am)')
-
-C Mutually aligned operands: 8 limbs per iteration.
- ALIGN(16)
-L(atop):movdqa -8(up), %xmm0
- movdqa -24(up), %xmm1
- movdqa -40(up), %xmm2
- movdqa -56(up), %xmm3
- lea -64(up), up
- movdqa %xmm0, -8(rp)
- movdqa %xmm1, -24(rp)
- movdqa %xmm2, -40(rp)
- movdqa %xmm3, -56(rp)
- lea -64(rp), rp
-L(am): sub $8, n
- jnc L(atop)
-
-C Wind down: bits 2, 1 and 0 of n still give the number of limbs left, since
-C subtracting 8 does not change them.
- test $4, R8(n)
- jz 1f
- movdqa -8(up), %xmm0
- movdqa -24(up), %xmm1
- lea -32(up), up
- movdqa %xmm0, -8(rp)
- movdqa %xmm1, -24(rp)
- lea -32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa -8(up), %xmm0
- lea -16(up), up
- movdqa %xmm0, -8(rp)
- lea -16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-C Code for up at the other 8-byte phase than rp. All loads are aligned
-C 16-byte blocks of the source; palignr splices two neighbouring blocks into
-C the 16 bytes for one aligned store. palignr is invoked as an m4 macro that is
-C defined outside this chunk; presumably it emits the SSSE3 instruction with
-C the same operand order.
-L(uent):sub $16, n
- movdqa (up), %xmm0 C top limb in the low half; this also reads the 8 bytes above it
- jc L(uend) C flags are still those of the sub above
-
-C 16 limbs per iteration. The flags set by the sub at the top are consumed by
-C the jnc at the bottom; only moves, palignr and lea lie in between.
- ALIGN(16)
-L(utop):sub $16, n
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- movdqa -32(up), %xmm2
- palignr($8, %xmm2, %xmm1)
- movdqa %xmm1, -24(rp)
- movdqa -48(up), %xmm3
- palignr($8, %xmm3, %xmm2)
- movdqa %xmm2, -40(rp)
- movdqa -64(up), %xmm0
- palignr($8, %xmm0, %xmm3)
- movdqa %xmm3, -56(rp)
- movdqa -80(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -72(rp)
- movdqa -96(up), %xmm2
- palignr($8, %xmm2, %xmm1)
- movdqa %xmm1, -88(rp)
- movdqa -112(up), %xmm3
- palignr($8, %xmm3, %xmm2)
- movdqa %xmm2, -104(rp)
- movdqa -128(up), %xmm0 C carried into the next iteration or the wind-down
- palignr($8, %xmm0, %xmm3)
- movdqa %xmm3, -120(rp)
- lea -128(up), up
- lea -128(rp), rp
- jnc L(utop)
-
-C Wind down: the low 4 bits of n give the limbs left; %xmm0 holds the block at
-C (up) on entry to each step.
-L(uend):test $8, R8(n)
- jz 1f
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- movdqa -32(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, -24(rp)
- movdqa -48(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -40(rp)
- movdqa -64(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, -56(rp)
- lea -64(up), up
- lea -64(rp), rp
-
-1: test $4, R8(n)
- jz 1f
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- movdqa -32(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, -24(rp)
- lea -32(up), up
- lea -32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa -16(up), %xmm1
- palignr($8, %xmm1, %xmm0)
- movdqa %xmm0, -8(rp)
- lea -16(up), up
- lea -16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-C When COPYD_SSE_THRESHOLD is below 8 the inner sub/jnc pair is not emitted, so
-C the 4-limb block at L(top) runs at most once.
-
-L(bc): sub $4, R32(n)
- jc L(end)
-
- ALIGN(16)
-L(top): mov (up), %r8
- mov -8(up), %r9
- lea -32(rp), rp
- mov -16(up), %r10
- mov -24(up), %r11
- lea -32(up), up
- mov %r8, 32(rp)
- mov %r9, 24(rp)
-ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
-` sub $4, R32(n)')
- mov %r10, 16(rp)
- mov %r11, 8(rp)
-ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
-` jnc L(top)')
-
-L(end): test $1, R8(n) C bits 0 and 1 of n give the limbs left
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
- lea -8(rp), rp
- lea -8(up), up
-1: test $2, R8(n)
- jz 1f
- mov (up), %r8
- mov -8(up), %r9
- mov %r8, (rp)
- mov %r9, -8(rp)
-1: FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/copyd.asm b/gmp/mpn/x86_64/fastsse/copyd.asm
deleted file mode 100644
index 5c6094c7e2..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyd.asm
+++ /dev/null
@@ -1,145 +0,0 @@
-dnl AMD64 mpn_copyd optimised for CPUs with fast SSE.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb good for cpu?
-C AMD K8,K9
-C AMD K10 0.85 Y
-C AMD bd1 0.8 Y
-C AMD bobcat
-C Intel P4 2.28 Y
-C Intel core2 1
-C Intel NHM 0.5 Y
-C Intel SBR 0.5 Y
-C Intel atom
-C VIA nano 1.1 Y
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We can always write using
-C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
-C operations.
-
-C Instead of having separate loops for reading aligned and unaligned, we read
-C using MOVDQU. This seems to work great except for core2; there performance
-C doubles when reading using MOVDQA (for aligned source). It is unclear how to
-C best handle the unaligned case there.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-C mpn_copyd: copy n limbs from up to rp, working from the highest address
-C downwards. Register roles come from the defines above: rp = destination,
-C up = source, n = limb count. n = 0 returns immediately.
-C Scratch: %rax, %r8, %xmm0-%xmm7.
-C
-C %xmm6 and %xmm7 are callee-saved in the Microsoft x64 calling convention, and
-C this file declares ABI_SUPPORT(DOS64). Only the main loop uses them, so the
-C DOS64 build saves them just before the loop and restores them just after it;
-C operands too short to reach the loop pay nothing. The frame is the same
-C 56-byte one that the IFDOS sequence in fastsse/copyi-palignr.asm uses.
-C NOTE(review): this assumes %rsp is 8 (mod 16) after FUNC_ENTRY under DOS64,
-C so that %rsp - 56 is 16-byte aligned for movdqa -- confirm against dos64.m4.
-PROLOGUE(mpn_copyd)
- FUNC_ENTRY(3)
-
- test n, n
- jz L(don)
-
- lea -16(rp,n,8), rp C rp, up -> the two most significant limbs
- lea -16(up,n,8), up
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(ali) C jump if rp aligned
- mov 8(up), %rax C copy the top limb so that rp becomes aligned
- lea -8(up), up
- mov %rax, 8(rp)
- lea -8(rp), rp
- dec n
-
-L(ali): sub $16, n
- jc L(sma) C fewer than 16 limbs left: skip loop and saves
-
-IFDOS(` add $-56, %rsp ')
-IFDOS(` movdqa %xmm6, (%rsp) ')
-IFDOS(` movdqa %xmm7, 16(%rsp) ')
-
-C Main loop: 16 limbs (8 x 16 bytes) per iteration, unaligned loads and
-C aligned stores.
- ALIGN(16)
-L(top): movdqu (up), %xmm0
- movdqu -16(up), %xmm1
- movdqu -32(up), %xmm2
- movdqu -48(up), %xmm3
- movdqu -64(up), %xmm4
- movdqu -80(up), %xmm5
- movdqu -96(up), %xmm6
- movdqu -112(up), %xmm7
- lea -128(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, -16(rp)
- movdqa %xmm2, -32(rp)
- movdqa %xmm3, -48(rp)
- movdqa %xmm4, -64(rp)
- movdqa %xmm5, -80(rp)
- movdqa %xmm6, -96(rp)
- movdqa %xmm7, -112(rp)
- lea -128(rp), rp
- sub $16, n
- jnc L(top)
-
-C The add below changes the flags, which is harmless: L(sma) starts with a
-C fresh test.
-IFDOS(` movdqa (%rsp), %xmm6 ')
-IFDOS(` movdqa 16(%rsp), %xmm7 ')
-IFDOS(` add $56, %rsp ')
-
-C Wind down: subtracting 16 leaves the low 4 bits of n unchanged, so they
-C still give the number of limbs left.
-L(sma): test $8, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu -16(up), %xmm1
- movdqu -32(up), %xmm2
- movdqu -48(up), %xmm3
- lea -64(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, -16(rp)
- movdqa %xmm2, -32(rp)
- movdqa %xmm3, -48(rp)
- lea -64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu -16(up), %xmm1
- lea -32(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, -16(rp)
- lea -32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- movdqu (up), %xmm0
- lea -16(up), up
- movdqa %xmm0, (rp)
- lea -16(rp), rp
-1:
- test $1, R8(n)
- jz 1f
- mov 8(up), %r8
- mov %r8, 8(rp)
-1:
-L(don): FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/copyi-palignr.asm b/gmp/mpn/x86_64/fastsse/copyi-palignr.asm
deleted file mode 100644
index fda3c3500f..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyi-palignr.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 2.0 illop 1.0/1.0 N
-C AMD K10 0.85 illop Y/N
-C AMD bull 0.70 0.66 Y
-C AMD pile 0.68 0.66 Y
-C AMD steam ? ?
-C AMD bobcat 1.97 8.16 1.5/1.5 N
-C AMD jaguar ? ?
-C Intel P4 2.26 illop Y/N
-C Intel core 0.52 0.64 opt/opt Y
-C Intel NHM 0.52 0.71 opt/opt Y
-C Intel SBR 0.51 0.54 opt/0.51 Y
-C Intel IBR ? ? Y
-C Intel HWL 0.51 0.52 0.25/0.25 N
-C Intel atom 1.16 1.61 opt/opt Y
-C VIA nano 1.09 1.08 opt/opt Y
-
-C We use only 16-byte operations, except for unaligned top-most and bottom-most
-C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
-C instruction is better adapted to mpn_copyd's needs, we need to contort the
-C code to use it here.
-C
-C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
-C taken from the x86_64 default code.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-C There are three instructions for loading an aligned 128-bit quantity. We use
-C movaps, since it has the shortest coding.
-dnl define(`movdqa', ``movaps'')
-
-ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')
-
-ASM_START()
- TEXT
- ALIGN(64)
-C mpn_copyi, SSSE3 palignr variant: copy n limbs from up to rp, working from
-C the lowest address upwards. Register roles come from the defines above:
-C rp = destination, up = source, n = limb count.
-C Scratch: %r8-%r11, %xmm0-%xmm8. The DOS64 build saves and restores
-C %xmm6-%xmm8 around the only code that uses them.
-C NOTE(review): the alignment tests on bit 3 assume rp and up are 8-byte
-C aligned limb pointers -- confirm against the mpn calling contract.
-PROLOGUE(mpn_copyi)
- FUNC_ENTRY(3)
-
- cmp $COPYI_SSE_THRESHOLD, n
- jbe L(bc) C small operand: plain 64-bit code
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(rp_aligned) C jump if rp aligned
-
- movsq C copy one limb
- dec n
-
-L(rp_aligned):
- test $8, R8(up) C does up have the same alignment as rp?
- jnz L(uent) C no: use the palignr code
-
-C With a threshold of at least 8 the first loop test is done here and control
-C falls into L(atop); otherwise control enters at the loop test L(am).
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
-` sub $8, n',
-` jmp L(am)')
-
-C Mutually aligned operands: 8 limbs per iteration.
- ALIGN(16)
-L(atop):movdqa 0(up), %xmm0
- movdqa 16(up), %xmm1
- movdqa 32(up), %xmm2
- movdqa 48(up), %xmm3
- lea 64(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-L(am): sub $8, n
- jnc L(atop)
-
-C Wind down: bits 2, 1 and 0 of n still give the number of limbs left, since
-C subtracting 8 does not change them.
- test $4, R8(n)
- jz 1f
- movdqa (up), %xmm0
- movdqa 16(up), %xmm1
- lea 32(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa (up), %xmm0
- lea 16(up), up
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-L(uent):
-C Code handling up - rp = 8 (mod 16)
-C All 16-byte loads use offsets that are 8 (mod 16) from up, and palignr
-C splices two neighbouring loaded blocks into the 16 bytes for one aligned
-C store. palignr is invoked as an m4 macro that is defined outside this chunk;
-C presumably it emits the SSSE3 instruction with the same operand order.
-
- cmp $16, n
- jc L(ued0) C fewer than 16 limbs: wind-down code only
-
-C Save %xmm6-%xmm8 for the DOS64 build; they are used only between here and
-C the matching restore in front of L(ued0).
-IFDOS(` add $-56, %rsp ')
-IFDOS(` movdqa %xmm6, (%rsp) ')
-IFDOS(` movdqa %xmm7, 16(%rsp) ')
-IFDOS(` movdqa %xmm8, 32(%rsp) ')
-
-C Preload the upper six blocks of the first 16-limb group.
- movaps 120(up), %xmm7
- movaps 104(up), %xmm6
- movaps 88(up), %xmm5
- movaps 72(up), %xmm4
- movaps 56(up), %xmm3
- movaps 40(up), %xmm2
- lea 128(up), up
- sub $32, n
- jc L(ued1) C fewer than 32 limbs: no full loop iteration
-
-C Each iteration stores 16 limbs and preloads six blocks of the next group.
-C The flags set by the sub near the top are consumed by the jnc at the bottom;
-C only moves, palignr and lea lie in between.
- ALIGN(16)
-L(utop):movaps -104(up), %xmm1
- sub $16, n
- movaps -120(up), %xmm0
- palignr($8, %xmm6, %xmm7)
- movaps -136(up), %xmm8
- movdqa %xmm7, 112(rp)
- palignr($8, %xmm5, %xmm6)
- movaps 120(up), %xmm7
- movdqa %xmm6, 96(rp)
- palignr($8, %xmm4, %xmm5)
- movaps 104(up), %xmm6
- movdqa %xmm5, 80(rp)
- palignr($8, %xmm3, %xmm4)
- movaps 88(up), %xmm5
- movdqa %xmm4, 64(rp)
- palignr($8, %xmm2, %xmm3)
- movaps 72(up), %xmm4
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movaps 56(up), %xmm3
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movaps 40(up), %xmm2
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm8, %xmm0)
- lea 128(up), up
- movdqa %xmm0, (rp)
- lea 128(rp), rp
- jnc L(utop)
-
-C Store the last preloaded 16-limb group without preloading another one.
-L(ued1):movaps -104(up), %xmm1
- movaps -120(up), %xmm0
- movaps -136(up), %xmm8
- palignr($8, %xmm6, %xmm7)
- movdqa %xmm7, 112(rp)
- palignr($8, %xmm5, %xmm6)
- movdqa %xmm6, 96(rp)
- palignr($8, %xmm4, %xmm5)
- movdqa %xmm5, 80(rp)
- palignr($8, %xmm3, %xmm4)
- movdqa %xmm4, 64(rp)
- palignr($8, %xmm2, %xmm3)
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm8, %xmm0)
- movdqa %xmm0, (rp)
- lea 128(rp), rp
-
-IFDOS(` movdqa (%rsp), %xmm6 ')
-IFDOS(` movdqa 16(%rsp), %xmm7 ')
-IFDOS(` movdqa 32(%rsp), %xmm8 ')
-IFDOS(` add $56, %rsp ')
-
-C Wind down: the low 4 bits of n give the limbs left. Only %xmm0-%xmm4 are
-C used from here on.
-L(ued0):test $8, R8(n)
- jz 1f
- movaps 56(up), %xmm3
- movaps 40(up), %xmm2
- movaps 24(up), %xmm1
- movaps 8(up), %xmm0
- movaps -8(up), %xmm4
- palignr($8, %xmm2, %xmm3)
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm4, %xmm0)
- lea 64(up), up
- movdqa %xmm0, (rp)
- lea 64(rp), rp
-
-1: test $4, R8(n)
- jz 1f
- movaps 24(up), %xmm1
- movaps 8(up), %xmm0
- palignr($8, %xmm0, %xmm1)
- movaps -8(up), %xmm3
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
- lea 32(up), up
- movdqa %xmm0, (rp)
- lea 32(rp), rp
-
-1: test $2, R8(n)
- jz 1f
- movdqa 8(up), %xmm0
- movdqa -8(up), %xmm3
- palignr($8, %xmm3, %xmm0)
- lea 16(up), up
- movdqa %xmm0, (rp)
- lea 16(rp), rp
-
-1: test $1, R8(n)
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-
-1: FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-C rp is biased by -8 here, so the stores below use offsets that are 8 larger
-C than the matching load offsets. When COPYI_SSE_THRESHOLD is below 8 the inner
-C sub/jnc pair is not emitted, so the 4-limb block at L(top) runs at most once.
-
-L(bc): lea -8(rp), rp
- sub $4, R32(n)
- jc L(end)
-
- ALIGN(16)
-L(top): mov (up), %r8
- mov 8(up), %r9
- lea 32(rp), rp
- mov 16(up), %r10
- mov 24(up), %r11
- lea 32(up), up
- mov %r8, -24(rp)
- mov %r9, -16(rp)
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
-` sub $4, R32(n)')
- mov %r10, -8(rp)
- mov %r11, (rp)
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
-` jnc L(top)')
-
-L(end): test $1, R8(n) C bits 0 and 1 of n give the limbs left
- jz 1f
- mov (up), %r8
- mov %r8, 8(rp)
- lea 8(rp), rp
- lea 8(up), up
-1: test $2, R8(n)
- jz 1f
- mov (up), %r8
- mov 8(up), %r9
- mov %r8, 8(rp)
- mov %r9, 16(rp)
-1: FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/copyi.asm b/gmp/mpn/x86_64/fastsse/copyi.asm
deleted file mode 100644
index a1a1c231dc..0000000000
--- a/gmp/mpn/x86_64/fastsse/copyi.asm
+++ /dev/null
@@ -1,166 +0,0 @@
-dnl AMD64 mpn_copyi optimised for CPUs with fast SSE.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb good for cpu?
-C AMD K8,K9
-C AMD K10 0.85 1.64 Y/N
-C AMD bd1 1.4 1.4 Y
-C AMD bobcat
-C Intel P4 2.3 2.3 Y
-C Intel core2 1.0 1.0
-C Intel NHM 0.5 0.67 Y
-C Intel SBR 0.5 0.75 Y
-C Intel atom
-C VIA nano 1.16 5.16 Y/N
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We can always write using
-C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
-C operations.
-
-C Instead of having separate loops for reading aligned and unaligned, we read
-C using MOVDQU. This seems to work great except for core2; there performance
-C doubles when reading using MOVDQA (for aligned source). It is unclear how to
-C best handle the unaligned case there.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-dnl define(`movdqu', lddqu)
-
-ASM_START()
- TEXT
- ALIGN(64)
-C mpn_copyi: copy n limbs from up to rp, working from the lowest address
-C upwards. Register roles come from the defines above: rp = destination,
-C up = source, n = limb count. Scratch: %rax, %rcx, %r8, %xmm0-%xmm7.
-C
-C %xmm6 and %xmm7 are callee-saved in the Microsoft x64 calling convention, and
-C this file declares ABI_SUPPORT(DOS64). Only the main loop uses them, so the
-C DOS64 build saves them just before the loop and restores them just after it.
-C This keeps the exit at L(end), which the basecase also reaches, free of any
-C restore. The frame is the same 56-byte one that the IFDOS sequence in
-C fastsse/copyi-palignr.asm uses.
-C NOTE(review): this assumes %rsp is 8 (mod 16) after FUNC_ENTRY under DOS64,
-C so that %rsp - 56 is 16-byte aligned for movdqa -- confirm against dos64.m4.
-PROLOGUE(mpn_copyi)
- FUNC_ENTRY(3)
-
- cmp $3, n
- jc L(bc) C n < 3: basecase
-
- test $8, R8(rp) C is rp 16-byte aligned?
- jz L(ali) C jump if rp aligned
- movsq C copy single limb
- dec n
-
-L(ali): sub $16, n
- jc L(sma) C fewer than 16 limbs left: skip loop and saves
-
-IFDOS(` add $-56, %rsp ')
-IFDOS(` movdqa %xmm6, (%rsp) ')
-IFDOS(` movdqa %xmm7, 16(%rsp) ')
-
-C Main loop: 16 limbs (8 x 16 bytes) per iteration, unaligned loads and
-C aligned stores.
- ALIGN(16)
-L(top): movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- movdqu 64(up), %xmm4
- movdqu 80(up), %xmm5
- movdqu 96(up), %xmm6
- movdqu 112(up), %xmm7
- lea 128(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- movdqa %xmm4, 64(rp)
- movdqa %xmm5, 80(rp)
- movdqa %xmm6, 96(rp)
- movdqa %xmm7, 112(rp)
- lea 128(rp), rp
- sub $16, n
- jnc L(top)
-
-C The add below changes the flags, which is harmless: L(sma) starts with a
-C fresh test.
-IFDOS(` movdqa (%rsp), %xmm6 ')
-IFDOS(` movdqa 16(%rsp), %xmm7 ')
-IFDOS(` add $56, %rsp ')
-
-C Wind down: subtracting 16 leaves the low 4 bits of n unchanged, so they
-C still give the number of limbs left.
-L(sma): test $8, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- movdqu 32(up), %xmm2
- movdqu 48(up), %xmm3
- lea 64(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- movdqa %xmm2, 32(rp)
- movdqa %xmm3, 48(rp)
- lea 64(rp), rp
-1:
- test $4, R8(n)
- jz 1f
- movdqu (up), %xmm0
- movdqu 16(up), %xmm1
- lea 32(up), up
- movdqa %xmm0, (rp)
- movdqa %xmm1, 16(rp)
- lea 32(rp), rp
-1:
- test $2, R8(n)
- jz 1f
- movdqu (up), %xmm0
- lea 16(up), up
- movdqa %xmm0, (rp)
- lea 16(rp), rp
- ALIGN(16)
-1:
-L(end): test $1, R8(n) C also entered from L(bc) when n < 2
- jz 1f
- mov (up), %r8
- mov %r8, (rp)
-1:
- FUNC_EXIT()
- ret
-
-C Basecase code. Needed for good small operands speed, not for
-C correctness as the above code is currently written.
-
-L(bc): sub $2, n
- jc L(end) C n was 0 or 1; bit 0 of n is unchanged by the sub
- ALIGN(16)
-1: mov (up), %rax
- mov 8(up), %rcx
- lea 16(up), up
- mov %rax, (rp)
- mov %rcx, 8(rp)
- lea 16(rp), rp
- sub $2, n
- jnc 1b
-
- test $1, R8(n)
- jz L(ret)
- mov (up), %rax
- mov %rax, (rp)
-L(ret): FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm b/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm
deleted file mode 100644
index a05e850a1f..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm
+++ /dev/null
@@ -1,182 +0,0 @@
-dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 3 3 2.35 no, use shl/shr
-C AMD K10 1.5-1.8 1.5-1.8 1.33 yes
-C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes
-C AMD bobcat 3.17 3.17 yes, bad for n < 20
-C Intel P4 4.67 4.67 2.7 no, slow movdqu
-C Intel core2 2.15 2.15 1.25 no, use shld/shrd
-C Intel NHM 1.66 1.66 1.25 no, use shld/shrd
-C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6
-C Intel atom 11.7 11.7 4.5 no
-C VIA nano 5.7 5.95 2.0 no, slow movdqu
-
-C We try to do as many aligned 16-byte operations as possible. The top-most
-C and bottom-most writes might need 8-byte operations.
-C
-C This variant relies on fast movdqu loads, and uses them even for aligned
-C operands, in order to avoid the need for two separate loops.
-C
-C TODO
-C * Could 2-limb wind-down code be simplified?
-C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
-C for other affected CPUs.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-C mpn_lshift, movdqu variant. Register roles come from the defines above:
-C rp = destination, ap = source, n = limb count, cnt = shift count in %rcx.
-C Each result limb is (ap[i] << cnt) | (ap[i-1] >> (64-cnt)), computed two
-C limbs at a time with psllq/psrlq/por and written from the top limb
-C downwards.
-C %rax is loaded once, from the top source limb shifted right by 64-cnt, and
-C is not written again; presumably this is the function's return value.
-C Scratch: %rcx, %r8, %xmm0, %xmm1, %xmm4, %xmm5.
-C NOTE(review): the code appears to assume 1 <= cnt <= 63 -- confirm against
-C the mpn_lshift contract.
-PROLOGUE(mpn_lshift)
- FUNC_ENTRY(4)
- movd R32(%rcx), %xmm4 C xmm4 = cnt, the left shift count
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5 C xmm5 = 64-cnt, the right shift count
-
- neg R32(%rcx) C the shr below uses only the low bits of %cl
- mov -8(ap,n,8), %rax C top source limb
- shr R8(%rcx), %rax C rax = the bits shifted out at the top
-
- cmp $3, n
- jle L(bc) C n <= 3: basecase
-
- lea (rp,n,8), R32(%rcx) C only bit 3 of the end address is needed
- test $8, R8(%rcx) C is the end of rp 16-byte aligned?
- jz L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-C Select one of the four entry points of the unrolled loop from (n+1) & 6 and
-C bias n to match. Each entry point handles 2 limbs before reaching the next.
-L(rp_aligned):
- lea 1(n), %r8d
-
- and $6, R32(%r8)
- jz L(ba0)
- cmp $4, R32(%r8)
- jz L(ba4)
- jc L(ba2)
-L(ba6): add $-4, n
- jmp L(i56)
-L(ba0): add $-6, n
- jmp L(i70)
-L(ba4): add $-2, n
- jmp L(i34)
-L(ba2): add $-8, n
- jle L(end)
-
-C Main loop: 8 limbs per iteration. In each step %xmm0 holds two source
-C limbs and %xmm1 the 16 bytes starting one limb lower; the stores are aligned.
- ALIGN(16)
-L(top): movdqu 40(ap,n,8), %xmm1
- movdqu 48(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, 48(rp,n,8)
-L(i70):
- movdqu 24(ap,n,8), %xmm1
- movdqu 32(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, 32(rp,n,8)
-L(i56):
- movdqu 8(ap,n,8), %xmm1
- movdqu 16(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, 16(rp,n,8)
-L(i34):
- movdqu -8(ap,n,8), %xmm1
- movdqu (ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, (rp,n,8)
- sub $8, n
- jg L(top)
-
-L(end): test $1, R8(n)
- jnz L(end8) C L(end8) stores only the lowest limb
-
-C Two lowest limbs: punpcklqdq builds (0, ap[0]) in %xmm0, so that zeros are
-C shifted into the lowest result limb.
- movdqu (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp)
- FUNC_EXIT()
- ret
-
-C Basecase
-C Handles n <= 3 one limb at a time with 8-byte movq accesses.
- ALIGN(16)
-L(bc): dec R32(n)
- jz L(end8) C n was 1
-
- movq (ap,n,8), %xmm1 C top limb
- movq -8(ap,n,8), %xmm0 C the limb below it
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, (rp,n,8)
- sub $2, R32(n)
- jl L(end8) C n was 2
- movq 8(ap), %xmm1
- movq (ap), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0 C lowest limb: only the left shift applies
- psllq %xmm4, %xmm0
- movq %xmm0, (rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/lshift.asm b/gmp/mpn/x86_64/fastsse/lshift.asm
deleted file mode 100644
index f76972a22f..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshift.asm
+++ /dev/null
@@ -1,169 +0,0 @@
-dnl AMD64 mpn_lshift optimised for CPUs with fast SSE.
-
-dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb good
-C 16-byte aligned 16-byte unaligned for cpu?
-C AMD K8,K9 ? ?
-C AMD K10 1.68 (1.45) 1.75 (1.49) Y
-C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
-C AMD bobcat 4 4
-C Intel P4 3 (2.7) 3 (2.7) Y
-C Intel core2 2.05 (1.67) 2.55 (1.75)
-C Intel NHM 2.05 (1.75) 2.09 (2)
-C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y
-C Intel atom ? ?
-C VIA nano 2.25 (2) 2.5 (2) Y
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations.
-
-C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
-C not true. The aligned case reads 16+8 bytes, the unaligned case reads
-C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
-
-C This is not yet great code:
-C (1) The unaligned case makes many reads.
-C (2) We should do some unrolling, at least 2-way.
-C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
-C Nano.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_lshift)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov -8(ap,n,8), %rax
- shr R8(%rcx), %rax
-
- cmp $2, n
- jle L(le2)
-
- lea (rp,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-L(rp_aligned):
- lea (ap,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(aent)
- jmp L(uent)
-C *****************************************************************************
-
-C Handle the case when ap != rp (mod 16).
-
- ALIGN(16)
-L(utop):movdqa -8(ap,n,8), %xmm0
- movq (ap,n,8), %xmm1
- punpcklqdq 8(ap,n,8), %xmm1
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(uent):sub $2, n
- ja L(utop)
-
- jne L(end8)
-
- movq (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- punpcklqdq 8(ap), %xmm1
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
-C Handle the case when ap = rp (mod 16).
-
- ALIGN(16)
-L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
- movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
- punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(aent):
- sub $2, n
- ja L(atop)
- jne L(end8)
-
- movdqa (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
- ALIGN(16)
-L(le2): jne L(end8)
-
- movq 8(ap), %xmm0
- movq (ap), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0
- psllq %xmm4, %xmm0
- movq %xmm0, (rp)
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
deleted file mode 100644
index 8250910c52..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 3 3 ? no, use shl/shr
-C AMD K10 1.8-2.0 1.8-2.0 ? yes
-C AMD bd1 1.9 1.9 ? yes
-C AMD bobcat 3.67 3.67 yes, bad for n < 20
-C Intel P4 4.75 4.75 ? no, slow movdqu
-C Intel core2 2.27 2.27 ? no, use shld/shrd
-C Intel NHM 2.15 2.15 ? no, use shld/shrd
-C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6
-C Intel atom 12.9 12.9 ? no
-C VIA nano 6.18 6.44 ? no, slow movdqu
-
-C We try to do as many aligned 16-byte operations as possible. The top-most
-C and bottom-most writes might need 8-byte operations.
-C
-C This variant rely on fast load movdqu, and uses it even for aligned operands,
-C in order to avoid the need for two separate loops.
-C
-C TODO
-C * Could 2-limb wind-down code be simplified?
-C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
-C for other affected CPUs.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_lshiftc)
- FUNC_ENTRY(4)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov -8(ap,n,8), %rax
- shr R8(%rcx), %rax
-
- pcmpeqb %xmm3, %xmm3 C set to 111...111
-
- cmp $3, n
- jle L(bc)
-
- lea (rp,n,8), R32(%rcx)
- test $8, R8(%rcx)
- jz L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-L(rp_aligned):
- lea 1(n), %r8d
-
- and $6, R32(%r8)
- jz L(ba0)
- cmp $4, R32(%r8)
- jz L(ba4)
- jc L(ba2)
-L(ba6): add $-4, n
- jmp L(i56)
-L(ba0): add $-6, n
- jmp L(i70)
-L(ba4): add $-2, n
- jmp L(i34)
-L(ba2): add $-8, n
- jle L(end)
-
- ALIGN(16)
-L(top): movdqu 40(ap,n,8), %xmm1
- movdqu 48(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, 48(rp,n,8)
-L(i70):
- movdqu 24(ap,n,8), %xmm1
- movdqu 32(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, 32(rp,n,8)
-L(i56):
- movdqu 8(ap,n,8), %xmm1
- movdqu 16(ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, 16(rp,n,8)
-L(i34):
- movdqu -8(ap,n,8), %xmm1
- movdqu (ap,n,8), %xmm0
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, (rp,n,8)
- sub $8, n
- jg L(top)
-
-L(end): test $1, R8(n)
- jnz L(end8)
-
- movdqu (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movdqa %xmm0, (rp)
- FUNC_EXIT()
- ret
-
-C Basecase
- ALIGN(16)
-L(bc): dec R32(n)
- jz L(end8)
-
- movq (ap,n,8), %xmm1
- movq -8(ap,n,8), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, (rp,n,8)
- sub $2, R32(n)
- jl L(end8)
- movq 8(ap), %xmm1
- movq (ap), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0
- psllq %xmm4, %xmm0
- pxor %xmm3, %xmm0
- movq %xmm0, (rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/lshiftc.asm b/gmp/mpn/x86_64/fastsse/lshiftc.asm
deleted file mode 100644
index d2520690e2..0000000000
--- a/gmp/mpn/x86_64/fastsse/lshiftc.asm
+++ /dev/null
@@ -1,179 +0,0 @@
-dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
-
-dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb good
-C 16-byte aligned 16-byte unaligned for cpu?
-C AMD K8,K9 ? ?
-C AMD K10 1.85 (1.635) 1.9 (1.67) Y
-C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
-C AMD bobcat 4.5 4.5
-C Intel P4 3.6 (3.125) 3.6 (3.125) Y
-C Intel core2 2.05 (1.67) 2.55 (1.75)
-C Intel NHM 2.05 (1.875) 2.6 (2.25)
-C Intel SBR 1.55 (1.44) 2 (1.57) Y
-C Intel atom ? ?
-C VIA nano 2.5 (2.5) 2.5 (2.5) Y
-
-C We try to do as many 16-byte operations as possible. The top-most and
-C bottom-most writes might need 8-byte operations. We always write using
-C 16-byte operations, we read with both 8-byte and 16-byte operations.
-
-C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
-C not true. The aligned case reads 16+8 bytes, the unaligned case reads
-C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
-
-C This is not yet great code:
-C (1) The unaligned case makes too many reads.
-C (2) We should do some unrolling, at least 2-way.
-C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
-C Nano.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_lshiftc)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov -8(ap,n,8), %rax
- shr R8(%rcx), %rax
-
- pcmpeqb %xmm7, %xmm7 C set to 111...111
-
- cmp $2, n
- jle L(le2)
-
- lea (rp,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq -8(ap,n,8), %xmm0
- movq -16(ap,n,8), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movq %xmm0, -8(rp,n,8)
- dec n
-
-L(rp_aligned):
- lea (ap,n,8), R32(%rcx)
- test $8, R8(%rcx)
- je L(aent)
- jmp L(uent)
-C *****************************************************************************
-
-C Handle the case when ap != rp (mod 16).
-
- ALIGN(16)
-L(utop):movq (ap,n,8), %xmm1
- punpcklqdq 8(ap,n,8), %xmm1
- movdqa -8(ap,n,8), %xmm0
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(uent):sub $2, n
- ja L(utop)
-
- jne L(end8)
-
- movq (ap), %xmm1
- pxor %xmm0, %xmm0
- punpcklqdq %xmm1, %xmm0
- punpcklqdq 8(ap), %xmm1
- psllq %xmm4, %xmm1
- psrlq %xmm5, %xmm0
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
-C Handle the case when ap = rp (mod 16).
-
- ALIGN(16)
-L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
- movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
- punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp,n,8)
-L(aent):sub $2, n
- ja L(atop)
-
- jne L(end8)
-
- movdqa (ap), %xmm0
- pxor %xmm1, %xmm1
- punpcklqdq %xmm0, %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movdqa %xmm0, (rp)
- ret
-C *****************************************************************************
-
- ALIGN(16)
-L(le2): jne L(end8)
-
- movq 8(ap), %xmm0
- movq (ap), %xmm1
- psllq %xmm4, %xmm0
- psrlq %xmm5, %xmm1
- por %xmm1, %xmm0
- pxor %xmm7, %xmm0
- movq %xmm0, 8(rp)
-
-L(end8):movq (ap), %xmm0
- psllq %xmm4, %xmm0
- pxor %xmm7, %xmm0
- movq %xmm0, (rp)
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm b/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm
deleted file mode 100644
index 1e270b13c3..0000000000
--- a/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm
+++ /dev/null
@@ -1,201 +0,0 @@
-dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb good
-C aligned unaligned best seen for cpu?
-C AMD K8,K9 3 3 2.35 no, use shl/shr
-C AMD K10 1.5-1.8 1.5-1.8 1.33 yes
-C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes
-C AMD bobcat 3.17 3.17 yes, bad for n < 20
-C Intel P4 4.67 4.67 2.7 no, slow movdqu
-C Intel core2 2.15 2.15 1.25 no, use shld/shrd
-C Intel NHM 1.66 1.66 1.25 no, use shld/shrd
-C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6
-C Intel atom 11.7 11.7 4.5 no
-C VIA nano 5.7 5.95 2.0 no, slow movdqu
-
-C We try to do as many aligned 16-byte operations as possible. The top-most
-C and bottom-most writes might need 8-byte operations.
-C
-C This variant rely on fast load movdqu, and uses it even for aligned operands,
-C in order to avoid the need for two separate loops.
-C
-C TODO
-C * Could 2-limb wind-down code be simplified?
-C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
-C for other affected CPUs.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`ap', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ASM_START()
- TEXT
- ALIGN(64)
-PROLOGUE(mpn_rshift)
- FUNC_ENTRY(4)
- movd R32(%rcx), %xmm4
- mov $64, R32(%rax)
- sub R32(%rcx), R32(%rax)
- movd R32(%rax), %xmm5
-
- neg R32(%rcx)
- mov (ap), %rax
- shl R8(%rcx), %rax
-
- cmp $3, n
- jle L(bc)
-
- test $8, R8(rp)
- jz L(rp_aligned)
-
-C Do one initial limb in order to make rp aligned
- movq (ap), %xmm0
- movq 8(ap), %xmm1
- psrlq %xmm4, %xmm0
- psllq %xmm5, %xmm1
- por %xmm1, %xmm0
- movq %xmm0, (rp)
- lea 8(ap), ap
- lea 8(rp), rp
- dec n
-
-L(rp_aligned):
- lea 1(n), %r8d
- lea (ap,n,8), ap
- lea (rp,n,8), rp
- neg n
-
- and $6, R32(%r8)
- jz L(bu0)
- cmp $4, R32(%r8)
- jz L(bu4)
- jc L(bu2)
-L(bu6): add $4, n
- jmp L(i56)
-L(bu0): add $6, n
- jmp L(i70)
-L(bu4): add $2, n
- jmp L(i34)
-L(bu2): add $8, n
- jge L(end)
-
- ALIGN(16)
-L(top): movdqu -64(ap,n,8), %xmm1
- movdqu -56(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -64(rp,n,8)
-L(i70):
- movdqu -48(ap,n,8), %xmm1
- movdqu -40(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -48(rp,n,8)
-L(i56):
- movdqu -32(ap,n,8), %xmm1
- movdqu -24(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -32(rp,n,8)
-L(i34):
- movdqu -16(ap,n,8), %xmm1
- movdqu -8(ap,n,8), %xmm0
- psllq %xmm5, %xmm0
- psrlq %xmm4, %xmm1
- por %xmm1, %xmm0
- movdqa %xmm0, -16(rp,n,8)
- add $8, n
- jl L(top)
-
-L(end): test $1, R8(n)
- jnz L(e1)
-
- movdqu -16(ap), %xmm1
- movq -8(ap), %xmm0
- psrlq %xmm4, %xmm1
- psllq %xmm5, %xmm0
- por %xmm1, %xmm0
- movdqa %xmm0, -16(rp)
- FUNC_EXIT()
- ret
-
-L(e1): movq -8(ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, -8(rp)
- FUNC_EXIT()
- ret
-
-C Basecase
- ALIGN(16)
-L(bc): dec R32(n)
- jnz 1f
- movq (ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, (rp)
- FUNC_EXIT()
- ret
-
-1: movq (ap), %xmm1
- movq 8(ap), %xmm0
- psrlq %xmm4, %xmm1
- psllq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, (rp)
- dec R32(n)
- jnz 1f
- movq 8(ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, 8(rp)
- FUNC_EXIT()
- ret
-
-1: movq 8(ap), %xmm1
- movq 16(ap), %xmm0
- psrlq %xmm4, %xmm1
- psllq %xmm5, %xmm0
- por %xmm1, %xmm0
- movq %xmm0, 8(rp)
- movq 16(ap), %xmm0
- psrlq %xmm4, %xmm0
- movq %xmm0, 16(rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fastsse/sec_tabselect.asm b/gmp/mpn/x86_64/fastsse/sec_tabselect.asm
deleted file mode 100644
index e3df110be4..0000000000
--- a/gmp/mpn/x86_64/fastsse/sec_tabselect.asm
+++ /dev/null
@@ -1,192 +0,0 @@
-dnl AMD64 SSE mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb cycles/limb cycles/limb
-C ali,evn n unal,evn n other cases
-C AMD K8,K9 1.65 1.65 1.8
-C AMD K10 0.78 0.78 0.85
-C AMD bd1 0.80 0.91 1.25
-C AMD bobcat 2.15 2.15 2.37
-C Intel P4 2.5 2.5 2.95
-C Intel core2 1.17 1.25 1.25
-C Intel NHM 0.87 0.90 0.90
-C Intel SBR 0.63 0.79 0.77
-C Intel atom 4.3 4.3 4.3 slower than plain code
-C VIA nano 1.4 5.1 3.14 too alignment dependent
-
-C NOTES
-C * We only honour the least significant 32 bits of the `which' and `nents'
-C arguments to allow efficient code using just SSE2. We would need to
-C either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence.
-C * We use movd for copying between xmm and plain registers, since old gas
-C rejects movq. But gas assembles movd as movq when given a 64-bit greg.
-
-define(`rp', `%rdi')
-define(`tp', `%rsi')
-define(`n', `%rdx')
-define(`nents', `%rcx')
-define(`which', `%r8')
-
-define(`i', `%r10')
-define(`j', `%r9')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C nents n rp tab which j i temp * * * *
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sec_tabselect)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
-
- movd which, %xmm8
- pshufd $0, %xmm8, %xmm8 C 4 `which' copies
- mov $1, R32(%rax)
- movd %rax, %xmm9
- pshufd $0, %xmm9, %xmm9 C 4 copies of 1
-
- mov n, j
- add $-8, j
- js L(outer_end)
-
-L(outer_top):
- mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- pxor %xmm5, %xmm5
- pxor %xmm6, %xmm6
- pxor %xmm7, %xmm7
- ALIGN(16)
-L(top): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movdqu 0(tp), %xmm2
- movdqu 16(tp), %xmm3
- pand %xmm0, %xmm2
- pand %xmm0, %xmm3
- por %xmm2, %xmm4
- por %xmm3, %xmm5
- movdqu 32(tp), %xmm2
- movdqu 48(tp), %xmm3
- pand %xmm0, %xmm2
- pand %xmm0, %xmm3
- por %xmm2, %xmm6
- por %xmm3, %xmm7
- lea (tp,n,8), tp
- add $-1, i
- jne L(top)
-
- movdqu %xmm4, 0(rp)
- movdqu %xmm5, 16(rp)
- movdqu %xmm6, 32(rp)
- movdqu %xmm7, 48(rp)
-
- lea 64(%r11), tp
- lea 64(rp), rp
- add $-8, j
- jns L(outer_top)
-L(outer_end):
-
- test $4, R8(n)
- je L(b0xx)
-L(b1xx):mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- pxor %xmm5, %xmm5
- ALIGN(16)
-L(tp4): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movdqu 0(tp), %xmm2
- movdqu 16(tp), %xmm3
- pand %xmm0, %xmm2
- pand %xmm0, %xmm3
- por %xmm2, %xmm4
- por %xmm3, %xmm5
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp4)
- movdqu %xmm4, 0(rp)
- movdqu %xmm5, 16(rp)
- lea 32(%r11), tp
- lea 32(rp), rp
-
-L(b0xx):test $2, R8(n)
- je L(b00x)
-L(b01x):mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- ALIGN(16)
-L(tp2): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movdqu 0(tp), %xmm2
- pand %xmm0, %xmm2
- por %xmm2, %xmm4
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp2)
- movdqu %xmm4, 0(rp)
- lea 16(%r11), tp
- lea 16(rp), rp
-
-L(b00x):test $1, R8(n)
- je L(b000)
-L(b001):mov nents, i
- mov tp, %r11
- pxor %xmm13, %xmm13
- pxor %xmm4, %xmm4
- ALIGN(16)
-L(tp1): movdqa %xmm8, %xmm0
- pcmpeqd %xmm13, %xmm0
- paddd %xmm9, %xmm13
- movq 0(tp), %xmm2
- pand %xmm0, %xmm2
- por %xmm2, %xmm4
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp1)
- movq %xmm4, 0(rp)
-
-L(b000):FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fat/fat.c b/gmp/mpn/x86_64/fat/fat.c
deleted file mode 100644
index 1b3f4e48be..0000000000
--- a/gmp/mpn/x86_64/fat/fat.c
+++ /dev/null
@@ -1,368 +0,0 @@
-/* x86_64 fat binary initializers.
-
- Contributed to the GNU project by Kevin Ryde (original x86_32 code) and
- Torbjorn Granlund (port to x86_64)
-
- THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY.
- THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR
- COMPLETELY IN FUTURE GNU MP RELEASES.
-
-Copyright 2003, 2004, 2009, 2011-2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#include <stdio.h> /* for printf */
-#include <stdlib.h> /* for getenv */
-#include <string.h>
-
-#include "gmp.h"
-#include "gmp-impl.h"
-
-/* Change this to "#define TRACE(x) x" for some traces. */
-#define TRACE(x)
-
-
-/* fat_entry.asm */
-long __gmpn_cpuid (char [12], int);
-
-
-#if WANT_FAKE_CPUID
-/* The "name"s in the table are values for the GMP_CPU_TYPE environment
- variable. Anything can be used, but for now it's the canonical cpu types
- as per config.guess/config.sub. */
-
-#define __gmpn_cpuid fake_cpuid
-
-#define MAKE_FMS(family, model) \
- ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20) \
- + (((model) & 0xf) << 4) + (((model) & 0xf0) << 12))
-
-static struct {
- const char *name;
- const char vendor[13];
- unsigned fms;
-} fake_cpuid_table[] = {
- { "core2", "GenuineIntel", MAKE_FMS (6, 0xf) },
- { "coreinhm", "GenuineIntel", MAKE_FMS (6, 0x1a) },
- { "coreiwsm", "GenuineIntel", MAKE_FMS (6, 0x25) },
- { "coreisbr", "GenuineIntel", MAKE_FMS (6, 0x2a) },
- { "coreihwl", "GenuineIntel", MAKE_FMS (6, 0x3c) },
- { "atom", "GenuineIntel", MAKE_FMS (6, 0x1c) },
- { "pentium4", "GenuineIntel", MAKE_FMS (15, 3) },
-
- { "k8", "AuthenticAMD", MAKE_FMS (15, 0) },
- { "k10", "AuthenticAMD", MAKE_FMS (16, 0) },
- { "bobcat", "AuthenticAMD", MAKE_FMS (20, 1) },
- { "bulldozer", "AuthenticAMD", MAKE_FMS (21, 1) },
- { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) },
- { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) },
- { "excavator", "AuthenticAMD", MAKE_FMS (21, 0x60) },
- { "jaguar", "AuthenticAMD", MAKE_FMS (22, 1) },
-
- { "nano", "CentaurHauls", MAKE_FMS (6, 15) },
-};
-
-static int
-fake_cpuid_lookup (void)
-{
- char *s;
- int i;
-
- s = getenv ("GMP_CPU_TYPE");
- if (s == NULL)
- {
- printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n");
- abort ();
- }
-
- for (i = 0; i < numberof (fake_cpuid_table); i++)
- if (strcmp (s, fake_cpuid_table[i].name) == 0)
- return i;
-
- printf ("GMP_CPU_TYPE=%s unknown\n", s);
- abort ();
-}
-
-static long
-fake_cpuid (char dst[12], unsigned int id)
-{
- int i = fake_cpuid_lookup();
-
- switch (id) {
- case 0:
- memcpy (dst, fake_cpuid_table[i].vendor, 12);
- return 0;
- case 1:
- return fake_cpuid_table[i].fms;
- case 7:
- dst[0] = 0xff; /* BMI1, AVX2, etc */
- dst[1] = 0xff; /* BMI2, etc */
- return 0;
- case 0x80000001:
- dst[4 + 29 / 8] = (1 << (29 % 8)); /* "long" mode */
- return 0;
- default:
- printf ("fake_cpuid(): oops, unknown id %d\n", id);
- abort ();
- }
-}
-#endif
-
-
-typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t));
-typedef DECL_preinv_mod_1 ((*preinv_mod_1_t));
-
-struct cpuvec_t __gmpn_cpuvec = {
- __MPN(add_n_init),
- __MPN(addlsh1_n_init),
- __MPN(addlsh2_n_init),
- __MPN(addmul_1_init),
- __MPN(addmul_2_init),
- __MPN(bdiv_dbm1c_init),
- __MPN(cnd_add_n_init),
- __MPN(cnd_sub_n_init),
- __MPN(com_init),
- __MPN(copyd_init),
- __MPN(copyi_init),
- __MPN(divexact_1_init),
- __MPN(divrem_1_init),
- __MPN(gcd_1_init),
- __MPN(lshift_init),
- __MPN(lshiftc_init),
- __MPN(mod_1_init),
- __MPN(mod_1_1p_init),
- __MPN(mod_1_1p_cps_init),
- __MPN(mod_1s_2p_init),
- __MPN(mod_1s_2p_cps_init),
- __MPN(mod_1s_4p_init),
- __MPN(mod_1s_4p_cps_init),
- __MPN(mod_34lsub1_init),
- __MPN(modexact_1c_odd_init),
- __MPN(mul_1_init),
- __MPN(mul_basecase_init),
- __MPN(mullo_basecase_init),
- __MPN(preinv_divrem_1_init),
- __MPN(preinv_mod_1_init),
- __MPN(redc_1_init),
- __MPN(redc_2_init),
- __MPN(rshift_init),
- __MPN(sqr_basecase_init),
- __MPN(sub_n_init),
- __MPN(sublsh1_n_init),
- __MPN(submul_1_init),
- 0
-};
-
-int __gmpn_cpuvec_initialized = 0;
-
-/* The following setups start with generic x86, then overwrite with
- specifics for a chip, and higher versions of that chip.
-
- The arrangement of the setups here will normally be the same as the $path
- selections in configure.in for the respective chips.
-
- This code is reentrant and thread safe. We always calculate the same
- decided_cpuvec, so if two copies of the code are running it doesn't
- matter which completes first, both write the same to __gmpn_cpuvec.
-
- We need to go via decided_cpuvec because if one thread has completed
- __gmpn_cpuvec then it may be making use of the threshold values in that
- vector. If another thread is still running __gmpn_cpuvec_init then we
- don't want it to write different values to those fields since some of the
- asm routines only operate correctly up to their own defined threshold,
- not an arbitrary value. */
-
-void
-__gmpn_cpuvec_init (void)
-{
- struct cpuvec_t decided_cpuvec;
- char vendor_string[13];
- char dummy_string[12];
- long fms;
- int family, model;
-
- TRACE (printf ("__gmpn_cpuvec_init:\n"));
-
- memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec));
-
- CPUVEC_SETUP_x86_64;
- CPUVEC_SETUP_fat;
-
- __gmpn_cpuid (vendor_string, 0);
- vendor_string[12] = 0;
-
- fms = __gmpn_cpuid (dummy_string, 1);
- family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
- model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
-
- /* Check extended feature flags */
- __gmpn_cpuid (dummy_string, 0x80000001);
- if ((dummy_string[4 + 29 / 8] & (1 << (29 % 8))) == 0)
- abort (); /* longmode-capable-bit turned off! */
-
- /*********************************************************/
- /*** WARNING: keep this list in sync with config.guess ***/
- /*********************************************************/
- if (strcmp (vendor_string, "GenuineIntel") == 0)
- {
- switch (family)
- {
- case 6:
- switch (model)
- {
- case 0x0f: /* Conroe Merom Kentsfield Allendale */
- case 0x10:
- case 0x11:
- case 0x12:
- case 0x13:
- case 0x14:
- case 0x15:
- case 0x16:
- case 0x17: /* PNR Wolfdale Yorkfield */
- case 0x18:
- case 0x19:
- case 0x1d: /* PNR Dunnington */
- CPUVEC_SETUP_core2;
- break;
-
- case 0x1c: /* Atom Silverthorne */
- case 0x26: /* Atom Lincroft */
- case 0x27: /* Atom Saltwell? */
- case 0x36: /* Atom Cedarview/Saltwell */
- CPUVEC_SETUP_atom;
- break;
-
- case 0x1a: /* NHM Gainestown */
- case 0x1b:
- case 0x1e: /* NHM Lynnfield/Jasper */
- case 0x1f:
- case 0x20:
- case 0x21:
- case 0x22:
- case 0x23:
- case 0x24:
- case 0x25: /* WSM Clarkdale/Arrandale */
- case 0x28:
- case 0x29:
- case 0x2b:
- case 0x2c: /* WSM Gulftown */
- case 0x2e: /* NHM Beckton */
- case 0x2f: /* WSM Eagleton */
- case 0x37: /* Atom Silvermont */
- case 0x4d: /* Atom Silvermont/Avoton */
- CPUVEC_SETUP_core2;
- CPUVEC_SETUP_coreinhm;
- break;
-
- case 0x2a: /* SB */
- case 0x2d: /* SBC-EP */
- case 0x3a: /* IBR */
- case 0x3e: /* IBR Ivytown */
- CPUVEC_SETUP_core2;
- CPUVEC_SETUP_coreinhm;
- CPUVEC_SETUP_coreisbr;
- break;
- case 0x3c: /* Haswell client */
- case 0x3d: /* Broadwell */
- case 0x3f: /* Haswell server */
- case 0x45: /* Haswell ULT */
- case 0x46: /* Crystal Well */
- case 0x4f: /* Broadwell server */
- case 0x56: /* Broadwell microserver */
- CPUVEC_SETUP_core2;
- CPUVEC_SETUP_coreinhm;
- CPUVEC_SETUP_coreisbr;
- /* Some Haswells lack BMI2. Let them appear as Sandybridges for
- now. */
- __gmpn_cpuid (dummy_string, 7);
- if ((dummy_string[0 + 8 / 8] & (1 << (8 % 8))) != 0)
- CPUVEC_SETUP_coreihwl;
- break;
- }
- break;
-
- case 15:
- CPUVEC_SETUP_pentium4;
- break;
- }
- }
- else if (strcmp (vendor_string, "AuthenticAMD") == 0)
- {
- switch (family)
- {
- case 0x0f: /* k8 */
- case 0x11: /* "fam 11h", mix of k8 and k10 */
- case 0x13:
- case 0x17:
- CPUVEC_SETUP_k8;
- break;
-
- case 0x10: /* k10 */
- case 0x12: /* k10 (llano) */
- CPUVEC_SETUP_k8;
- CPUVEC_SETUP_k10;
- break;
-
- case 0x14: /* bobcat */
- case 0x16: /* jaguar */
- CPUVEC_SETUP_k8;
- CPUVEC_SETUP_k10;
- CPUVEC_SETUP_bobcat;
- break;
-
- case 0x15: /* bulldozer, piledriver, steamroller, excavator */
- CPUVEC_SETUP_k8;
- CPUVEC_SETUP_k10;
- CPUVEC_SETUP_bd1;
- }
- }
- else if (strcmp (vendor_string, "CentaurHauls") == 0)
- {
- switch (family)
- {
- case 6:
- if (model >= 15)
- CPUVEC_SETUP_nano;
- break;
- }
- }
-
- /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1.
- Instead default to the plain versions from whichever CPU we detected.
- The function arguments are compatible, no need for any glue code. */
- if (decided_cpuvec.preinv_divrem_1 == NULL)
- decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1;
- if (decided_cpuvec.preinv_mod_1 == NULL)
- decided_cpuvec.preinv_mod_1 =(preinv_mod_1_t) decided_cpuvec.mod_1;
-
- ASSERT_CPUVEC (decided_cpuvec);
- CPUVEC_INSTALL (decided_cpuvec);
-
- /* Set this once the threshold fields are ready.
- Use volatile to prevent it getting moved. */
- *((volatile int *) &__gmpn_cpuvec_initialized) = 1;
-}
diff --git a/gmp/mpn/x86_64/fat/fat_entry.asm b/gmp/mpn/x86_64/fat/fat_entry.asm
deleted file mode 100644
index 764e3d82f2..0000000000
--- a/gmp/mpn/x86_64/fat/fat_entry.asm
+++ /dev/null
@@ -1,204 +0,0 @@
-dnl x86 fat binary entrypoints.
-
-dnl Contributed to the GNU project by Kevin Ryde (original x86_32 code) and
-dnl Torbjorn Granlund (port to x86_64)
-
-dnl Copyright 2003, 2009, 2011-2014 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-dnl Forcibly disable profiling.
-dnl
-dnl The entrypoints and inits are small enough not to worry about, the real
-dnl routines arrived at will have any profiling. Also, the way the code
-dnl here ends with a jump means we won't work properly with the
-dnl "instrument" profiling scheme anyway.
-
-define(`WANT_PROFILING',no)
-
-
-dnl We define PRETEND_PIC as a helper symbol, the use it for suppressing
-dnl normal, fast call code, since that triggers problems on Darwin and
-dnl OpenBSD.
-
-ifdef(`DARWIN',
-`define(`PRETEND_PIC')')
-ifdef(`OPENBSD',
-`define(`PRETEND_PIC')')
-ifdef(`PIC',
-`define(`PRETEND_PIC')')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
- TEXT
-
-dnl Usage: FAT_ENTRY(name, offset)
-dnl
-dnl Emit a fat binary entrypoint function of the given name. This is the
-dnl normal entry for applications, eg. __gmpn_add_n.
-dnl
-dnl The code simply jumps through the function pointer in __gmpn_cpuvec at
-dnl the given "offset" (in bytes).
-dnl
-dnl For non-PIC, the jumps are 5 bytes each, aligning them to 8 should be
-dnl fine for all x86s.
-dnl
-dnl For ELF/DARWIN PIC, the jumps are 20 bytes each, and are best aligned to
-dnl 16 to ensure at least the first two instructions don't cross a cache line
-dnl boundary.
-dnl
-dnl For DOS64, the jumps are 6 bytes. The same form works also for GNU/Linux
-dnl (at least with certain assembler/linkers) but FreeBSD 8.2 crashes. Not
-dnl tested on Darwin, Slowaris, NetBSD, etc.
-dnl
-dnl Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE
-dnl grepping in configure, stopping that code trying to eval something with
-dnl $1 in it.
-
-define(FAT_ENTRY,
-m4_assert_numargs(2)
-`ifdef(`HOST_DOS64',
-` ALIGN(8)
-`'PROLOGUE($1)
- jmp *$2+GSYM_PREFIX`'__gmpn_cpuvec(%rip)
-EPILOGUE()
-',
-` ALIGN(ifdef(`PIC',16,8))
-`'PROLOGUE($1)
-ifdef(`PRETEND_PIC',
-` LEA( GSYM_PREFIX`'__gmpn_cpuvec, %rax)
- jmp *$2(%rax)
-',`dnl non-PIC
- jmp *GSYM_PREFIX`'__gmpn_cpuvec+$2
-')
-EPILOGUE()
-')')
-
-
-dnl FAT_ENTRY for each CPUVEC_FUNCS_LIST
-dnl
-
-define(`CPUVEC_offset',0)
-foreach(i,
-`FAT_ENTRY(MPN(i),CPUVEC_offset)
-define(`CPUVEC_offset',eval(CPUVEC_offset + 8))',
-CPUVEC_FUNCS_LIST)
-
-
-dnl Usage: FAT_INIT(name, offset)
-dnl
-dnl Emit a fat binary initializer function of the given name. These
-dnl functions are the initial values for the pointers in __gmpn_cpuvec.
-dnl
-dnl The code simply calls __gmpn_cpuvec_init, and then jumps back through
-dnl the __gmpn_cpuvec pointer, at the given "offset" (in bytes).
-dnl __gmpn_cpuvec_init will have stored the address of the selected
-dnl implementation there.
-dnl
-dnl Only one of these routines will be executed, and only once, since after
-dnl that all the __gmpn_cpuvec pointers go to real routines. So there's no
-dnl need for anything special here, just something small and simple. To
-dnl keep code size down, "fat_init" is a shared bit of code, arrived at
-dnl with the offset in %al. %al is used since the movb instruction is 2
-dnl bytes where %eax would be 4.
-dnl
-dnl Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the
-dnl HAVE_NATIVE grepping in configure, preventing that code trying to eval
-dnl something with $1 in it.
-dnl
-dnl We need to preserve parameter registers over the __gmpn_cpuvec_init call
-
-define(FAT_INIT,
-m4_assert_numargs(2)
-`PROLOGUE($1)
- mov $`'$2, %al
- jmp L(fat_init)
-EPILOGUE()
-')
-
-dnl FAT_INIT for each CPUVEC_FUNCS_LIST
-dnl
-
-define(`CPUVEC_offset',0)
-foreach(i,
-`FAT_INIT(MPN(i`'_init),CPUVEC_offset)
-define(`CPUVEC_offset',eval(CPUVEC_offset + 1))',
-CPUVEC_FUNCS_LIST)
-
-L(fat_init):
- C al __gmpn_cpuvec byte offset
-
- movzbl %al, %eax
-IFSTD(` push %rdi ')
-IFSTD(` push %rsi ')
- push %rdx
- push %rcx
- push %r8
- push %r9
- push %rax
- CALL( __gmpn_cpuvec_init)
- pop %rax
- pop %r9
- pop %r8
- pop %rcx
- pop %rdx
-IFSTD(` pop %rsi ')
-IFSTD(` pop %rdi ')
-ifdef(`PRETEND_PIC',`
- LEA( GSYM_PREFIX`'__gmpn_cpuvec, %r10)
- jmp *(%r10,%rax,8)
-',`dnl non-PIC
- jmp *GSYM_PREFIX`'__gmpn_cpuvec(,%rax,8)
-')
-
-
-C long __gmpn_cpuid (char dst[12], int id);
-C
-C This is called only 3 times, so just something simple and compact is fine.
-C
-C The rcx/ecx zeroing here is needed for the BMI2 check.
-
-define(`rp', `%rdi')
-define(`idx', `%rsi')
-
-PROLOGUE(__gmpn_cpuid)
- FUNC_ENTRY(2)
- mov %rbx, %r8
- mov R32(idx), R32(%rax)
- xor %ecx, %ecx
- cpuid
- mov %ebx, (rp)
- mov %edx, 4(rp)
- mov %ecx, 8(rp)
- mov %r8, %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/fat/gmp-mparam.h b/gmp/mpn/x86_64/fat/gmp-mparam.h
deleted file mode 100644
index 005c893635..0000000000
--- a/gmp/mpn/x86_64/fat/gmp-mparam.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Fat binary x86_64 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2003, 2009, 2011 Free Software Foundation,
-Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-
-/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes. The only time
- this might not be true currently is for actual 80386 and 80486 chips,
- where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but
- that's not worth worrying about. */
-#define DIVEXACT_1_THRESHOLD 0
-
-/* Only some of the x86s have an mpn_preinv_divrem_1, but we set
- USE_PREINV_DIVREM_1 so that all callers use it, and then let the
- __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual
- preinv. */
-#define USE_PREINV_DIVREM_1 1
-
-#define BMOD_1_TO_MOD_1_THRESHOLD 20
-
-/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need
- for mpn_sqr to call the latter. */
-#define SQR_BASECASE_THRESHOLD 0
-
-/* Sensible fallbacks for these, when not taken from a cpu-specific
- gmp-mparam.h. */
-#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 130
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 200
-
-/* These are values more or less in the middle of what the typical x86 chips
- come out as. For a fat binary it's necessary to have values for these,
- since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out
- as non-constant array initializers. FIXME: Perhaps these should be done
- in the cpuvec structure like other thresholds. */
-#define MUL_FFT_TABLE { 464, 928, 1920, 3584, 10240, 40960, 0 }
-#define MUL_FFT_MODF_THRESHOLD 400
-#define MUL_FFT_THRESHOLD 2000
-
-#define SQR_FFT_TABLE { 528, 1184, 1920, 4608, 14336, 40960, 0 }
-#define SQR_FFT_MODF_THRESHOLD 500
-#define SQR_FFT_THRESHOLD 3000
diff --git a/gmp/mpn/x86_64/fat/mod_1.c b/gmp/mpn/x86_64/fat/mod_1.c
deleted file mode 100644
index 4f149cc353..0000000000
--- a/gmp/mpn/x86_64/fat/mod_1.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_mod_1.
-
-Copyright 2003, 2009 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/mod_1.c"
diff --git a/gmp/mpn/x86_64/fat/mul_basecase.c b/gmp/mpn/x86_64/fat/mul_basecase.c
deleted file mode 100644
index d9eb4718c2..0000000000
--- a/gmp/mpn/x86_64/fat/mul_basecase.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_mul_basecase.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/mul_basecase.c"
diff --git a/gmp/mpn/x86_64/fat/mullo_basecase.c b/gmp/mpn/x86_64/fat/mullo_basecase.c
deleted file mode 100644
index 7f86be64c5..0000000000
--- a/gmp/mpn/x86_64/fat/mullo_basecase.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_mullo_basecase.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/mullo_basecase.c"
diff --git a/gmp/mpn/x86_64/fat/redc_1.c b/gmp/mpn/x86_64/fat/redc_1.c
deleted file mode 100644
index 0025403353..0000000000
--- a/gmp/mpn/x86_64/fat/redc_1.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_redc_1.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/redc_1.c"
diff --git a/gmp/mpn/x86_64/fat/redc_2.c b/gmp/mpn/x86_64/fat/redc_2.c
deleted file mode 100644
index 1932d58323..0000000000
--- a/gmp/mpn/x86_64/fat/redc_2.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_redc_2.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/redc_2.c"
diff --git a/gmp/mpn/x86_64/fat/sqr_basecase.c b/gmp/mpn/x86_64/fat/sqr_basecase.c
deleted file mode 100644
index d1c5dcd2e0..0000000000
--- a/gmp/mpn/x86_64/fat/sqr_basecase.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Fat binary fallback mpn_sqr_basecase.
-
-Copyright 2012 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-
-#include "mpn/generic/sqr_basecase.c"
diff --git a/gmp/mpn/x86_64/gcd_1.asm b/gmp/mpn/x86_64/gcd_1.asm
deleted file mode 100644
index 252d4174eb..0000000000
--- a/gmp/mpn/x86_64/gcd_1.asm
+++ /dev/null
@@ -1,163 +0,0 @@
-dnl AMD64 mpn_gcd_1 -- mpn by 1 gcd.
-
-dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
-dnl Granlund.
-
-dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/bit (approx)
-C AMD K8,K9 5.21 (4.95)
-C AMD K10 5.15 (5.00)
-C AMD bd1 5.42 (5.14)
-C AMD bobcat 6.71 (6.56)
-C Intel P4 13.5 (12.75)
-C Intel core2 6.20 (6.16)
-C Intel NHM 6.49 (6.25)
-C Intel SBR 7.75 (7.57)
-C Intel atom 8.77 (8.54)
-C VIA nano 6.60 (6.20)
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
-DEF_OBJECT(ctz_table,64)
- .byte MAXSHIFT
-forloop(i,1,MASK,
-` .byte m4_count_trailing_zeros(i)
-')
-END_OBJECT(ctz_table)
-
-C Threshold of when to call bmod when U is one limb. Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 8)
-
-C INPUT PARAMETERS
-define(`up', `%rdi')
-define(`n', `%rsi')
-define(`v0', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_gcd_1)
- FUNC_ENTRY(3)
- mov (up), %rax C U low limb
- mov $-1, R32(%rcx)
- or v0, %rax C x | y
-
-L(twos):
- inc R32(%rcx)
- shr %rax
- jnc L(twos)
-
- shr R8(%rcx), v0
- push %rcx C common twos
-
-L(divide_strip_y):
- shr v0
- jnc L(divide_strip_y)
- adc v0, v0
-
- cmp $1, n
- jnz L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
- mov (up), %r8
- mov %r8, %rax
- shr $BMOD_THRES_LOG2, %r8
- cmp %r8, v0
- ja L(noreduce)
- push v0
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-
-L(bmod):
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- CALL( mpn_modexact_1_odd)
-
-L(reduced):
- add $STACK_ALLOC, %rsp
- pop %rdx
-
-L(noreduce):
- LEA( ctz_table, %rsi)
- test %rax, %rax
- mov %rax, %rcx
- jnz L(mid)
- jmp L(end)
-
-L(reduce_nby1):
- push v0
- sub $STACK_ALLOC, %rsp C maintain ABI required rsp alignment
-
- cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
- jl L(bmod)
-IFDOS(` mov %rdx, %r8 ')
-IFDOS(` mov %rsi, %rdx ')
-IFDOS(` mov %rdi, %rcx ')
- CALL( mpn_mod_1)
- jmp L(reduced)
-
- ALIGN(16) C K8 BC P4 NHM SBR
-L(top): cmovc %rcx, %rax C if x-y < 0 0
- cmovc %rdi, %rdx C use x,y-x 0
-L(mid): and $MASK, R32(%rcx) C 0
- movzbl (%rsi,%rcx), R32(%rcx) C 1
- jz L(shift_alot) C 1
- shr R8(%rcx), %rax C 3
- mov %rax, %rdi C 4
- mov %rdx, %rcx C 3
- sub %rax, %rcx C 4
- sub %rdx, %rax C 4
- jnz L(top) C 5
-
-L(end): pop %rcx
- mov %rdx, %rax
- shl R8(%rcx), %rax
- FUNC_EXIT()
- ret
-
-L(shift_alot):
- shr $MAXSHIFT, %rax
- mov %rax, %rcx
- jmp L(mid)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/gmp-mparam.h b/gmp/mpn/x86_64/gmp-mparam.h
index 0dea8c94cd..5e2ed40332 100644
--- a/gmp/mpn/x86_64/gmp-mparam.h
+++ b/gmp/mpn/x86_64/gmp-mparam.h
@@ -1,218 +1,79 @@
-/* AMD K8-K10 gmp-mparam.h -- Compiler/machine parameter header file.
+/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2010, 2012 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-or
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
-or both in parallel, as here.
+/* 2200 MHz Opteron / rev A / 1024 Kibyte cache / socket 940 */
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 15
-
-#define MUL_TOOM22_THRESHOLD 27
-#define MUL_TOOM33_THRESHOLD 81
-#define MUL_TOOM44_THRESHOLD 234
-#define MUL_TOOM6H_THRESHOLD 418
-#define MUL_TOOM8H_THRESHOLD 466
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 175
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 36
-#define SQR_TOOM3_THRESHOLD 117
-#define SQR_TOOM4_THRESHOLD 327
-#define SQR_TOOM6_THRESHOLD 446
-#define SQR_TOOM8_THRESHOLD 547
-
-#define MULMID_TOOM42_THRESHOLD 36
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define POWM_SEC_TABLE 2,67,322,991
-
-#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 25, 8}, { 13, 7}, { 29, 8}, \
- { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
- { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \
- { 23, 7}, { 47, 8}, { 25, 7}, { 51, 8}, \
- { 29, 9}, { 15, 8}, { 37, 9}, { 19, 8}, \
- { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \
- { 55,10}, { 15, 9}, { 43,10}, { 23, 9}, \
- { 55,10}, { 31, 9}, { 63, 5}, { 1023, 4}, \
- { 2431, 5}, { 1279, 6}, { 671, 7}, { 367, 8}, \
- { 189, 9}, { 95, 8}, { 195, 9}, { 111,11}, \
- { 31, 9}, { 131,10}, { 71, 9}, { 155,10}, \
- { 79, 9}, { 159,10}, { 87,11}, { 47,10}, \
- { 111,11}, { 63,10}, { 135,11}, { 79,10}, \
- { 167,11}, { 95,10}, { 191,11}, { 111,12}, \
- { 63,11}, { 143,10}, { 287,11}, { 159,10}, \
- { 319,11}, { 175,12}, { 95,11}, { 207,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 543,11}, \
- { 287,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
- { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
- { 223,13}, { 127,12}, { 255,11}, { 543,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
- { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
- { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,12}, { 447,14}, \
- { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \
- { 607,11}, { 1215,13}, { 319,12}, { 671,11}, \
- { 1343,12}, { 735,13}, { 383,12}, { 767,11}, \
- { 1535,12}, { 799,11}, { 1599,12}, { 831,13}, \
- { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \
- { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \
- { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
- { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2431,14}, { 1279,13}, { 2687,14}, { 1407,15}, \
- { 767,14}, { 1535,13}, { 3071,14}, { 1791,16}, \
- { 511,15}, { 1023,14}, { 2431,15}, { 1279,14}, \
- { 2815,15}, { 1535,14}, { 3199,15}, { 1791,14}, \
- { 3583,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 185
-#define MUL_FFT_THRESHOLD 7552
-
-#define SQR_FFT_MODF_THRESHOLD 460 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 27, 7}, { 14, 6}, \
- { 29, 7}, { 15, 6}, { 31, 7}, { 29, 8}, \
- { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
- { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \
- { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
- { 51, 9}, { 27, 8}, { 55,10}, { 15, 9}, \
- { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \
- { 55,11}, { 15,10}, { 31, 9}, { 71,10}, \
- { 39, 9}, { 83,10}, { 47, 6}, { 767, 4}, \
- { 3263, 5}, { 1727, 4}, { 3455, 5}, { 1791, 6}, \
- { 927, 7}, { 479, 6}, { 959, 7}, { 511, 8}, \
- { 271, 9}, { 147,10}, { 87,11}, { 47,10}, \
- { 95,12}, { 31,11}, { 63,10}, { 135,11}, \
- { 79,10}, { 167,11}, { 95,10}, { 191,11}, \
- { 111,12}, { 63,11}, { 127,10}, { 255,11}, \
- { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
- { 159,12}, { 95,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 399,11}, { 207,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
- { 543,11}, { 287,10}, { 575,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,10}, { 831,11}, { 447,13}, \
- { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
- { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
- { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \
- { 1279,11}, { 671,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,12}, { 447,14}, { 127,13}, { 255,12}, \
- { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
- { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
- { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
- { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
- { 831,13}, { 447,12}, { 959,14}, { 255,13}, \
- { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \
- { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
- { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \
- { 1663,13}, { 895,12}, { 1791,13}, { 959,15}, \
- { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
- { 1215,14}, { 639,13}, { 1471,14}, { 767,13}, \
- { 1663,14}, { 895,13}, { 1855,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2303,14}, \
- { 1279,13}, { 2559,14}, { 1407,15}, { 767,14}, \
- { 1535,13}, { 3071,14}, { 1791,16}, { 511,15}, \
- { 1023,14}, { 2303,15}, { 1279,14}, { 2687,15}, \
- { 1535,14}, { 3199,15}, { 1791,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 203
-#define SQR_FFT_THRESHOLD 5248
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 35
-#define MULLO_MUL_N_THRESHOLD 15604
-
-#define DC_DIV_QR_THRESHOLD 56
-#define DC_DIVAPPR_Q_THRESHOLD 220
-#define DC_BDIV_QR_THRESHOLD 52
-#define DC_BDIV_Q_THRESHOLD 152
-
-#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 226
-#define INV_APPR_THRESHOLD 214
-
-#define BINV_NEWTON_THRESHOLD 327
-#define REDC_1_TO_REDC_2_THRESHOLD 4
-#define REDC_2_TO_REDC_N_THRESHOLD 79
-
-#define MU_DIV_QR_THRESHOLD 1895
-#define MU_DIVAPPR_Q_THRESHOLD 1895
-#define MUPI_DIV_QR_THRESHOLD 106
-#define MU_BDIV_QR_THRESHOLD 1589
-#define MU_BDIV_Q_THRESHOLD 1718
-
-#define MATRIX22_STRASSEN_THRESHOLD 16
-#define HGCD_THRESHOLD 125
-#define HGCD_APPR_THRESHOLD 173
-#define HGCD_REDUCE_THRESHOLD 3524
-#define GCD_DC_THRESHOLD 555
-#define GCDEXT_DC_THRESHOLD 478
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 28
-#define SET_STR_DC_THRESHOLD 248
-#define SET_STR_PRECOMPUTE_THRESHOLD 1648
-
-#define FAC_DSC_THRESHOLD 1075
-#define FAC_ODD_THRESHOLD 0 /* always */
+/* Generated by tuneup.c, 2009-01-14, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 28
+#define MUL_TOOM3_THRESHOLD 97
+#define MUL_TOOM44_THRESHOLD 406
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 38
+#define SQR_TOOM3_THRESHOLD 133
+#define SQR_TOOM4_THRESHOLD 547
+
+#define MULLOW_BASECASE_THRESHOLD 27
+#define MULLOW_DC_THRESHOLD 28
+#define MULLOW_MUL_N_THRESHOLD 199
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 74
+#define POWM_THRESHOLD 146
+
+#define MATRIX22_STRASSEN_THRESHOLD 24
+#define HGCD_THRESHOLD 143
+#define GCD_DC_THRESHOLD 529
+#define GCDEXT_DC_THRESHOLD 639
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 4
+#define MOD_1_2_THRESHOLD 7
+#define MOD_1_4_THRESHOLD 64
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 32
+#define SET_STR_DC_THRESHOLD 248
+#define SET_STR_PRECOMPUTE_THRESHOLD 2124
+
+#define MUL_FFT_TABLE { 432, 928, 2624, 3840, 11264, 36864, 147456, 327680, 0 }
+#define MUL_FFT_MODF_THRESHOLD 656
+#define MUL_FFT_THRESHOLD 7936
+
+#define SQR_FFT_TABLE { 432, 928, 2368, 4352, 11264, 28672, 114688, 327680, 0 }
+#define SQR_FFT_MODF_THRESHOLD 560
+#define SQR_FFT_THRESHOLD 7936
+
+#define MUL_FFT_TABLE2 {{1,4}, {337,5}, {673,6}, {1729,7}, {1793,6}, {2017,7}, {5633,8}, {11009,9}, {11777,8}, {14593,9}, {15873,8}, {16897,9}, {22017,10}, {23553,9}, {29697,10}, {31745,9}, {36353,10}, {39937,9}, {44545,10}, {48129,9}, {50689,10}, {56833,11}, {63489,10}, {78337,11}, {79873,10}, {86017,11}, {88065,10}, {92161,11}, {96257,10}, {106497,11}, {129025,10}, {141313,11}, {145409,10}, {146433,11}, {161793,10}, {167937,11}, {227329,12}, {258049,11}, {326657,12}, {389121,11}, {424961,13}, {516097,12}, {520193,11}, {528385,10}, {538625,11}, {547841,10}, {552961,11}, {587777,12}, {651265,11}, {718849,10}, {719873,12}, {782337,11}, {787457,10}, {791553,11}, {796673,10}, {802817,11}, {849921,10}, {850945,12}, {913409,11}, {915457,13}, {1040385,12}, {1044481,11}, {1112065,12}, {1175553,11}, {1243137,12}, {1306625,11}, {1374209,12}, {1437697,13}, {1564673,12}, {1568769,11}, {1581057,12}, {1585153,11}, {1595393,12}, {1597441,11}, {1630209,12}, {1699841,11}, {1761281,12}, {1830913,14}, {2080769,13}, {2088961,12}, {2486273,13}, {2613249,12}, {3010561,13}, {3137537,12}, {3534849,13}, {3661825,12}, {3928065,13}, {3964929,14}, {4014081,13}, {4046849,14}, {4136961,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4976641,13}, {5234689,12}, {5238785,13}, {5349377,12}, {5353473,13}, {5758977,12}, {5763073,14}, {6275073,13}, {7856129,14}, {8372225,13}, {9953281,14}, {10469377,13}, {12050433,14}, {12566529,13}, {13623297,14}, {14663681,13}, {15196161,15}, {16744449,14}, {16760833,13}, {17293313,14}, {18857985,13}, {19394561,14}, {MP_SIZE_T_MAX,0}}
+
+#define SQR_FFT_TABLE2 {{1,4}, {305,5}, {609,6}, {1601,7}, {4737,8}, {4865,7}, {5121,8}, {11009,9}, {11777,8}, {13057,9}, {13825,10}, {15361,9}, {15873,8}, {16129,9}, {22017,10}, {23553,9}, {28161,10}, {31745,9}, {36353,10}, {39937,9}, {42497,10}, {56321,11}, {63489,10}, {89601,11}, {96257,10}, {107521,12}, {126977,11}, {129025,10}, {135169,11}, {137217,10}, {139265,11}, {163841,10}, {173057,11}, {195073,9}, {196097,11}, {196609,10}, {201729,11}, {212993,12}, {217089,11}, {221185,12}, {258049,11}, {260609,10}, {261121,9}, {261633,11}, {292865,10}, {296961,11}, {299009,10}, {302081,11}, {325633,12}, {389121,11}, {392193,9}, {392705,11}, {393217,13}, {401409,11}, {404481,13}, {421889,11}, {424961,13}, {516097,12}, {520193,11}, {526337,10}, {532481,11}, {542721,10}, {543745,11}, {593921,12}, {598017,11}, {608257,12}, {610305,11}, {616449,12}, {651265,11}, {653313,10}, {687617,11}, {718849,10}, {749569,12}, {782337,11}, {784385,10}, {788481,11}, {793601,10}, {800769,11}, {802817,10}, {813057,11}, {850945,12}, {913409,11}, {917505,13}, {1040385,12}, {1044481,11}, {1113089,12}, {1175553,11}, {1243137,12}, {1309697,11}, {1347585,12}, {1351681,11}, {1368065,12}, {1437697,11}, {1503233,13}, {1564673,12}, {1568769,11}, {1628161,12}, {1839105,14}, {1851393,12}, {1884161,14}, {2080769,13}, {2088961,12}, {2488321,13}, {2613249,12}, {3010561,13}, {3137537,12}, {3403777,13}, {3661825,12}, {3928065,14}, {4177921,13}, {4186113,12}, {4452353,13}, {4710401,12}, {4976641,13}, {5234689,12}, {5500929,13}, {5758977,12}, {5763073,14}, {6275073,13}, {6283265,12}, {6549505,13}, {7856129,15}, {8011777,14}, {8060929,15}, {8355841,14}, {8372225,13}, {9953281,14}, {10469377,13}, {12050433,14}, {12566529,13}, {13623297,14}, {14663681,13}, {15196161,15}, {16744449,14}, {16760833,13}, {17293313,14}, {23052289,15}, {25133057,14}, {29343745,16}, {MP_SIZE_T_MAX,0}}
+#define INV_NEWTON_THRESHOLD 47
+#define BINV_NEWTON_THRESHOLD 18
diff --git a/gmp/mpn/x86_64/invert_limb.asm b/gmp/mpn/x86_64/invert_limb.asm
index cc79b89a2b..44fb83bd10 100644
--- a/gmp/mpn/x86_64/invert_limb.asm
+++ b/gmp/mpn/x86_64/invert_limb.asm
@@ -1,115 +1,121 @@
dnl AMD64 mpn_invert_limb -- Invert a normalized limb.
-dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
-
-dnl Copyright 2004, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2004, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb (approx) div
-C AMD K8,K9 48 71
-C AMD K10 48 77
-C Intel P4 135 161
-C Intel core2 69 116
-C Intel corei 55 89
-C Intel atom 129 191
-C VIA nano 79 157
+C K8: 40 71
+C P4: 141 161
+C P6-15 (Core2): 63 116
+C P6-28 (Atom): 130 191
C rax rcx rdx rdi rsi r8
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-PROTECT(`mpn_invert_limb_table')
ASM_START()
TEXT
ALIGN(16)
-PROLOGUE(mpn_invert_limb) C Kn C2 Ci
- FUNC_ENTRY(1)
- mov %rdi, %rax C 0 0 0
- shr $55, %rax C 1 1 1
+PROLOGUE(mpn_invert_limb) C in: %rdi = d (the limb to invert), out: %rax; uses rax rcx rdx rsi r8
+ mov %rdi, %rax
+ shr $55, %rax C %rax = d >> 55, the top 9 bits of d, used as table index
ifdef(`PIC',`
ifdef(`DARWIN',`
- mov mpn_invert_limb_table@GOTPCREL(%rip), %r8
+ mov approx_tab@GOTPCREL(%rip), %r8
add $-512, %r8
',`
- lea -512+mpn_invert_limb_table(%rip), %r8
+ lea -512+approx_tab(%rip), %r8
')',`
- movabs $-512+mpn_invert_limb_table, %r8
+ movabs $-512+approx_tab, %r8
')
- movzwl (%r8,%rax,2), R32(%rcx) C %rcx = v0
-
- C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
- mov %rdi, %rsi C 0 0 0
- mov R32(%rcx), R32(%rax) C 4 5 5
- imul R32(%rcx), R32(%rcx) C 4 5 5
- shr $24, %rsi C 1 1 1
- inc %rsi C %rsi = d40
- imul %rsi, %rcx C 8 10 8
- shr $40, %rcx C 12 15 11
- sal $11, R32(%rax) C 5 6 6
- dec R32(%rax)
- sub R32(%rcx), R32(%rax) C %rax = v1
-
- C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
- mov $0x1000000000000000, %rcx
- imul %rax, %rsi C 14 17 13
- sub %rsi, %rcx
- imul %rax, %rcx
- sal $13, %rax
- shr $47, %rcx
- add %rax, %rcx C %rcx = v2
-
- C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65
- mov %rdi, %rsi C 0 0 0
- shr %rsi C d/2
- sbb %rax, %rax C -d0 = -(d mod 2)
- sub %rax, %rsi C d63 = ceil(d/2)
- imul %rcx, %rsi C v2 * d63
- and %rcx, %rax C v2 * d0
- shr %rax C (v2>>1) * d0
- sub %rsi, %rax C (v2>>1) * d0 - v2 * d63
- mul %rcx
- sal $31, %rcx
- shr %rdx
- add %rdx, %rcx C %rcx = v3
-
- mov %rdi, %rax
- mul %rcx
- add %rdi, %rax
+ movzwl (%r8,%rax,2), R32(%rcx) C %rcx = v0 = 16-bit entry approx_tab[(d >> 55) - 256]; %r8 is the table address biased by -512 bytes
+ mov %rdi, %rsi
+ mov R32(%rcx), R32(%rax) C %rax = v0
+ imul R32(%rcx), R32(%rcx) C %rcx = v0^2 (32-bit multiply)
+ shr $32, %rsi C %rsi = d >> 32
+ imul %rsi, %rcx C %rcx = v0^2 * (d >> 32)
+ shr $31, %rcx
+ sal $17, %rax
+ sub %rcx, %rax C %rax = v1 = (v0 << 17) - ((v0^2 * (d >> 32)) >> 31)
+ mov %rax, %r8
+ imul %rax, %rax C %rax = low 64 bits of v1^2
+ sal $33, %r8 C %r8 = v1 << 33
+ mul %rdi C %rdx:%rax = %rax * d
+ neg %rdx
+ lea (%r8,%rdx,2), %rax C %rax = v2 = (v1 << 33) - 2 * (high limb of that product)
+ mov %rax, %r8 C %r8 = v2
+ mul %rax C %rdx:%rax = v2^2
+ mov %rax, %rcx C %rcx = low limb of v2^2
+ mov %rdx, %rax
+ mul %rdi C %rdx:%rax = (high limb of v2^2) * d
+ mov %rax, %rsi C %rsi = low limb of that product
mov %rcx, %rax
- adc %rdi, %rdx
- sub %rdx, %rax
-
- FUNC_EXIT()
+ mov %rdx, %rcx C %rcx = high limb of (high limb of v2^2) * d
+ mul %rdi C %rdx:%rax = (low limb of v2^2) * d
+ add %rdx, %rsi C add the two middle limbs; the carry is consumed by the sbb below
+ sbb %rcx, %r8 C %r8 = v2 - %rcx - carry
+ shr $62, %rsi
+ add $1, %rsi C %rsi = (middle-limb sum >> 62) + 1
+ sal $2, %r8
+ sub %rsi, %r8 C %r8 = 4 * %r8 - %rsi; NOTE(review): presumably the value called zh below - confirm
+ mov %rdi, %rax
+ mul %r8 C %rdx:%rax = d * %r8
+ add %rdi, %rax C xl += d
+ adc %rdi, %rdx C xh += d
+ mov %r8, %rax
+ sub %rdx, %rax C return zh - xh
ret
+
+ RODATA
+ ALIGN(2)
+approx_tab:
+ .value 0xffc0,0xfec0,0xfdc0,0xfcc0,0xfbc0,0xfac0,0xfa00,0xf900
+ .value 0xf800,0xf700,0xf640,0xf540,0xf440,0xf380,0xf280,0xf180
+ .value 0xf0c0,0xefc0,0xef00,0xee00,0xed40,0xec40,0xeb80,0xeac0
+ .value 0xe9c0,0xe900,0xe840,0xe740,0xe680,0xe5c0,0xe500,0xe400
+ .value 0xe340,0xe280,0xe1c0,0xe100,0xe040,0xdf80,0xdec0,0xde00
+ .value 0xdd40,0xdc80,0xdbc0,0xdb00,0xda40,0xd980,0xd8c0,0xd800
+ .value 0xd740,0xd680,0xd600,0xd540,0xd480,0xd3c0,0xd340,0xd280
+ .value 0xd1c0,0xd140,0xd080,0xcfc0,0xcf40,0xce80,0xcdc0,0xcd40
+ .value 0xcc80,0xcc00,0xcb40,0xcac0,0xca00,0xc980,0xc8c0,0xc840
+ .value 0xc780,0xc700,0xc640,0xc5c0,0xc540,0xc480,0xc400,0xc380
+ .value 0xc2c0,0xc240,0xc1c0,0xc100,0xc080,0xc000,0xbf80,0xbec0
+ .value 0xbe40,0xbdc0,0xbd40,0xbc80,0xbc00,0xbb80,0xbb00,0xba80
+ .value 0xba00,0xb980,0xb900,0xb840,0xb7c0,0xb740,0xb6c0,0xb640
+ .value 0xb5c0,0xb540,0xb4c0,0xb440,0xb3c0,0xb340,0xb2c0,0xb240
+ .value 0xb1c0,0xb140,0xb0c0,0xb080,0xb000,0xaf80,0xaf00,0xae80
+ .value 0xae00,0xad80,0xad40,0xacc0,0xac40,0xabc0,0xab40,0xaac0
+ .value 0xaa80,0xaa00,0xa980,0xa900,0xa8c0,0xa840,0xa7c0,0xa740
+ .value 0xa700,0xa680,0xa600,0xa5c0,0xa540,0xa4c0,0xa480,0xa400
+ .value 0xa380,0xa340,0xa2c0,0xa240,0xa200,0xa180,0xa140,0xa0c0
+ .value 0xa080,0xa000,0x9f80,0x9f40,0x9ec0,0x9e80,0x9e00,0x9dc0
+ .value 0x9d40,0x9d00,0x9c80,0x9c40,0x9bc0,0x9b80,0x9b00,0x9ac0
+ .value 0x9a40,0x9a00,0x9980,0x9940,0x98c0,0x9880,0x9840,0x97c0
+ .value 0x9780,0x9700,0x96c0,0x9680,0x9600,0x95c0,0x9580,0x9500
+ .value 0x94c0,0x9440,0x9400,0x93c0,0x9340,0x9300,0x92c0,0x9240
+ .value 0x9200,0x91c0,0x9180,0x9100,0x90c0,0x9080,0x9000,0x8fc0
+ .value 0x8f80,0x8f40,0x8ec0,0x8e80,0x8e40,0x8e00,0x8d80,0x8d40
+ .value 0x8d00,0x8cc0,0x8c80,0x8c00,0x8bc0,0x8b80,0x8b40,0x8b00
+ .value 0x8a80,0x8a40,0x8a00,0x89c0,0x8980,0x8940,0x88c0,0x8880
+ .value 0x8840,0x8800,0x87c0,0x8780,0x8740,0x8700,0x8680,0x8640
+ .value 0x8600,0x85c0,0x8580,0x8540,0x8500,0x84c0,0x8480,0x8440
+ .value 0x8400,0x8380,0x8340,0x8300,0x82c0,0x8280,0x8240,0x8200
+ .value 0x81c0,0x8180,0x8140,0x8100,0x80c0,0x8080,0x8040,0x8000
ASM_END()
diff --git a/gmp/mpn/x86_64/invert_limb_table.asm b/gmp/mpn/x86_64/invert_limb_table.asm
deleted file mode 100644
index 739d59e46c..0000000000
--- a/gmp/mpn/x86_64/invert_limb_table.asm
+++ /dev/null
@@ -1,50 +0,0 @@
-dnl Table used for mpn_invert_limb
-
-dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
-
-dnl Copyright 2004, 2007-2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-PROTECT(`mpn_invert_limb_table')
-
-ASM_START()
-C Table entry X contains floor (0x7fd00 / (0x100 + X))
-
- RODATA
- ALIGN(2)
- GLOBL mpn_invert_limb_table
-mpn_invert_limb_table:
-forloop(i,256,512-1,dnl
-` .value eval(0x7fd00/i)
-')dnl
-ASM_END()
diff --git a/gmp/mpn/x86_64/k10/gcd_1.asm b/gmp/mpn/x86_64/k10/gcd_1.asm
deleted file mode 100644
index 3d8e5c7ab1..0000000000
--- a/gmp/mpn/x86_64/k10/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
diff --git a/gmp/mpn/x86_64/k10/gmp-mparam.h b/gmp/mpn/x86_64/k10/gmp-mparam.h
deleted file mode 100644
index 5881306a40..0000000000
--- a/gmp/mpn/x86_64/k10/gmp-mparam.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/* AMD K10 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2012, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-#if 0
-#undef mpn_sublsh_n
-#define mpn_sublsh_n(rp,up,vp,n,c) \
- (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \
- : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
-#endif
-
-/* 3200 MHz K10 Thuban */
-/* FFT tuning limit = 100000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 15
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 81
-#define MUL_TOOM44_THRESHOLD 242
-#define MUL_TOOM6H_THRESHOLD 369
-#define MUL_TOOM8H_THRESHOLD 478
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 154
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 163
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 114
-#define SQR_TOOM4_THRESHOLD 390
-#define SQR_TOOM6_THRESHOLD 446
-#define SQR_TOOM8_THRESHOLD 547
-
-#define MULMID_TOOM42_THRESHOLD 36
-
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 17
-
-#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 570, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 29, 7}, { 15, 6}, { 31, 7}, { 29, 8}, \
- { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
- { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \
- { 23, 7}, { 47, 8}, { 25, 7}, { 51, 8}, \
- { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \
- { 55,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
- { 43,10}, { 23, 9}, { 55,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 87,11}, \
- { 47,10}, { 111,12}, { 31,11}, { 63,10}, \
- { 135,11}, { 79,10}, { 167, 8}, { 671,11}, \
- { 111,12}, { 63,11}, { 159,12}, { 95,11}, \
- { 207,10}, { 415,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 271,12}, { 159,11}, \
- { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
- { 367,12}, { 191,11}, { 415,12}, { 223,13}, \
- { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
- { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \
- { 319,11}, { 671,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
- { 831,12}, { 447,14}, { 127,13}, { 255,12}, \
- { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \
- { 671,11}, { 1343,12}, { 735,13}, { 383,12}, \
- { 799,11}, { 1599,12}, { 831,13}, { 447,12}, \
- { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
- { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \
- { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
- { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \
- { 767,13}, { 1727,14}, { 895,13}, { 1855,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2431,14}, { 1279,13}, { 2559,14}, { 1407,15}, \
- { 767,14}, { 1535,13}, { 3071,14}, { 1791,16}, \
- { 511,15}, { 1023,14}, { 2431,15}, { 1279,14}, \
- { 2815,15}, { 1535,14}, { 3199,15}, { 1791,14}, \
- { 3583,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
- { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \
- { 2815,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 169
-#define MUL_FFT_THRESHOLD 7808
-
-#define SQR_FFT_MODF_THRESHOLD 448 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 448, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 29, 7}, { 15, 6}, \
- { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \
- { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
- { 21, 7}, { 43, 8}, { 25, 7}, { 51, 8}, \
- { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
- { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \
- { 55,10}, { 15, 9}, { 31, 8}, { 65, 9}, \
- { 43,10}, { 23, 9}, { 55,11}, { 15,10}, \
- { 31, 9}, { 67,10}, { 39, 9}, { 83,10}, \
- { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \
- { 79,11}, { 47,10}, { 103,12}, { 31,11}, \
- { 63,10}, { 135,11}, { 79,10}, { 159,11}, \
- { 95,10}, { 191,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255,11}, { 143, 9}, { 575,10}, \
- { 303, 9}, { 607,12}, { 95,11}, { 191, 9}, \
- { 767,10}, { 399,11}, { 207,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 543, 9}, { 1087,10}, \
- { 575,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 335,10}, { 671,11}, { 351,10}, { 703, 9}, \
- { 1407,12}, { 191,11}, { 415,10}, { 831,12}, \
- { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
- { 543,10}, { 1087,12}, { 287,11}, { 607,12}, \
- { 319,11}, { 671,12}, { 351,11}, { 703,13}, \
- { 191,12}, { 383,11}, { 767,10}, { 1535,12}, \
- { 415,11}, { 863,12}, { 447,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
- { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \
- { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
- { 1343,12}, { 703,11}, { 1407,12}, { 735,13}, \
- { 383,12}, { 799,11}, { 1599,12}, { 863,13}, \
- { 447,12}, { 927,14}, { 255,13}, { 511,12}, \
- { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
- { 1343,13}, { 703,12}, { 1407,14}, { 383,13}, \
- { 767,12}, { 1535,13}, { 831,12}, { 1727,13}, \
- { 895,12}, { 1791,13}, { 959,15}, { 255,14}, \
- { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
- { 639,13}, { 1471,14}, { 767,13}, { 1663,14}, \
- { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2303,14}, { 1407,15}, \
- { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \
- { 2303,15}, { 1279,14}, { 2687,15}, { 1535,14}, \
- { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \
- { 4223,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
- { 5247,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 185
-#define SQR_FFT_THRESHOLD 5568
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 61
-#define MULLO_MUL_N_THRESHOLD 15604
-
-#define DC_DIV_QR_THRESHOLD 56
-#define DC_DIVAPPR_Q_THRESHOLD 218
-#define DC_BDIV_QR_THRESHOLD 52
-#define DC_BDIV_Q_THRESHOLD 42
-
-#define INV_MULMOD_BNM1_THRESHOLD 62
-#define INV_NEWTON_THRESHOLD 226
-#define INV_APPR_THRESHOLD 220
-
-#define BINV_NEWTON_THRESHOLD 327
-#define REDC_1_TO_REDC_2_THRESHOLD 51
-#define REDC_2_TO_REDC_N_THRESHOLD 66
-
-#define MU_DIV_QR_THRESHOLD 1752
-#define MU_DIVAPPR_Q_THRESHOLD 1718
-#define MUPI_DIV_QR_THRESHOLD 102
-#define MU_BDIV_QR_THRESHOLD 1528
-#define MU_BDIV_Q_THRESHOLD 1718
-
-#define POWM_SEC_TABLE 1,22,110,624,1985
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 147
-#define HGCD_APPR_THRESHOLD 181
-#define HGCD_REDUCE_THRESHOLD 3524
-#define GCD_DC_THRESHOLD 622
-#define GCDEXT_DC_THRESHOLD 487
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 29
-#define SET_STR_DC_THRESHOLD 268
-#define SET_STR_PRECOMPUTE_THRESHOLD 1718
-
-#define FAC_DSC_THRESHOLD 1075
-#define FAC_ODD_THRESHOLD 23
diff --git a/gmp/mpn/x86_64/k10/hamdist.asm b/gmp/mpn/x86_64/k10/hamdist.asm
deleted file mode 100644
index 44b67b5e4e..0000000000
--- a/gmp/mpn/x86_64/k10/hamdist.asm
+++ /dev/null
@@ -1,103 +0,0 @@
-dnl AMD64 mpn_hamdist -- hamming distance.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 2
-C Intel P4 n/a
-C Intel core2 n/a
-C Intel corei 2.05
-C Intel atom n/a
-C VIA nano n/a
-
-C This is very straightforward 2-way unrolled code.
-
-C TODO
-C * Write something less basic. It should not be hard to reach 1.5 c/l with
-C 4-way unrolling.
-
-define(`ap', `%rdi')
-define(`bp', `%rsi')
-define(`n', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_hamdist)
- FUNC_ENTRY(3)
- mov (ap), %r8
- xor (bp), %r8
-
- lea (ap,n,8), ap C point at A operand end
- lea (bp,n,8), bp C point at B operand end
- neg n
-
- bt $0, R32(n)
- jnc L(2)
-
-L(1): .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax
- xor R32(%r10), R32(%r10)
- add $1, n
- js L(top)
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(2): mov 8(ap,n,8), %r9
- .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax
- xor 8(bp,n,8), %r9
- .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
- add $2, n
- js L(top)
- lea (%r10, %rax), %rax
- FUNC_EXIT()
- ret
-
- ALIGN(16)
-L(top): mov (ap,n,8), %r8
- lea (%r10, %rax), %rax
- mov 8(ap,n,8), %r9
- xor (bp,n,8), %r8
- xor 8(bp,n,8), %r9
- .byte 0xf3,0x49,0x0f,0xb8,0xc8 C popcnt %r8, %rcx
- lea (%rcx, %rax), %rax
- .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
- add $2, n
- js L(top)
-
- lea (%r10, %rax), %rax
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/k10/lshift.asm b/gmp/mpn/x86_64/k10/lshift.asm
deleted file mode 100644
index a1cbc31f61..0000000000
--- a/gmp/mpn/x86_64/k10/lshift.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_lshift)
-include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp/mpn/x86_64/k10/lshiftc.asm b/gmp/mpn/x86_64/k10/lshiftc.asm
deleted file mode 100644
index ac90edb76b..0000000000
--- a/gmp/mpn/x86_64/k10/lshiftc.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_lshiftc)
-include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp/mpn/x86_64/k10/popcount.asm b/gmp/mpn/x86_64/k10/popcount.asm
deleted file mode 100644
index 3814aeabf4..0000000000
--- a/gmp/mpn/x86_64/k10/popcount.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-dnl AMD64 mpn_popcount -- population count.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 n/a
-C AMD K10 1.125
-C Intel P4 n/a
-C Intel core2 n/a
-C Intel corei 1.25
-C Intel atom n/a
-C VIA nano n/a
-
-C * The zero-offset of popcount is misassembled to the offset-less form, which
-C is one byte shorter and therefore will mess up the switching code.
-C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn,
-C which is the main reason for our usage of '.byte'.
-
-C TODO
-C * Improve switching code, the current code sucks.
-
-define(`up', `%rdi')
-define(`n', `%rsi')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_popcount)
- FUNC_ENTRY(2)
-
-ifelse(1,1,`
- lea (up,n,8), up
-
-C mov R32(n), R32(%rcx)
-C neg R32(%rcx)
- imul $-1, R32(n), R32(%rcx)
- and $8-1, R32(%rcx)
-
- neg n
-
- mov R32(%rcx), R32(%rax)
- neg %rax
- lea (up,%rax,8),up
-
- xor R32(%rax), R32(%rax)
-
- lea (%rcx,%rcx,4), %rcx
-
- lea L(top)(%rip), %rdx
- lea (%rdx,%rcx,2), %rdx
- jmp *%rdx
-',`
- lea (up,n,8), up
-
- mov R32(n), R32(%rcx)
- neg R32(%rcx)
- and $8-1, R32(%rcx)
-
- neg n
-
- mov R32(%rcx), R32(%rax)
- shl $3, R32(%rax)
- sub %rax, up
-
- xor R32(%rax), R32(%rax)
-
-C add R32(%rcx), R32(%rcx) C 2x
-C lea (%rcx,%rcx,4), %rcx C 10x
- imul $10, R32(%rcx)
-
- lea L(top)(%rip), %rdx
- add %rcx, %rdx
- jmp *%rdx
-')
-
- ALIGN(32)
-L(top):
-C 0 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00 C popcnt 0(up,n,8), %r8
- add %r8, %rax
-C 7 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08 C popcnt 8(up,n,8), %r9
- add %r9, %rax
-C 6 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10 C popcnt 16(up,n,8), %r8
- add %r8, %rax
-C 5 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18 C popcnt 24(up,n,8), %r9
- add %r9, %rax
-C 4 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20 C popcnt 32(up,n,8), %r8
- add %r8, %rax
-C 3 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28 C popcnt 40(up,n,8), %r9
- add %r9, %rax
-C 2 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30 C popcnt 48(up,n,8), %r8
- add %r8, %rax
-C 1 = n mod 8
- .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38 C popcnt 56(up,n,8), %r9
- add %r9, %rax
-
- add $8, n
- js L(top)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/k10/rshift.asm b/gmp/mpn/x86_64/k10/rshift.asm
deleted file mode 100644
index 4c1c0d4cde..0000000000
--- a/gmp/mpn/x86_64/k10/rshift.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_rshift)
-include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp/mpn/x86_64/k10/sec_tabselect.asm b/gmp/mpn/x86_64/k10/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/k10/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_sec_tabselect.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/k8/aorrlsh_n.asm b/gmp/mpn/x86_64/k8/aorrlsh_n.asm
deleted file mode 100644
index ff3a1842fd..0000000000
--- a/gmp/mpn/x86_64/k8/aorrlsh_n.asm
+++ /dev/null
@@ -1,217 +0,0 @@
-dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
-
-dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 2.87 < 3.85 for lshift + add_n
-C AMD K10 2.75 < 3.85 for lshift + add_n
-C Intel P4 22 > 7.33 for lshift + add_n
-C Intel core2 4.1 > 3.27 for lshift + add_n
-C Intel NHM 4.4 > 3.75 for lshift + add_n
-C Intel SBR 3.17 < 3.46 for lshift + add_n
-C Intel atom ? ? 8.75 for lshift + add_n
-C VIA nano 4.7 < 6.25 for lshift + add_n
-
-C TODO
-C * Can we propagate carry into rdx instead of using a special carry register?
-C That could save enough insns to get to 10 cycles/iteration.
-
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n_param', `%rcx')
-define(`cnt', `%r8')
-
-define(`vp', `%r12')
-define(`n', `%rbp')
-
-ifdef(`OPERATION_addlsh_n',`
- define(ADDSUB, `add')
- define(ADCSBB, `adc')
- define(func, mpn_addlsh_n)
-')
-ifdef(`OPERATION_rsblsh_n',`
- define(ADDSUB, `sub')
- define(ADCSBB, `sbb')
- define(func, mpn_rsblsh_n)
-')
-
-MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
- push %r12
- push %rbp
- push %rbx
-
- mov (vp_param), %rax C load first V limb early
-
- mov $0, R32(n)
- sub n_param, n
-
- lea -16(up,n_param,8), up
- lea -16(rp,n_param,8), rp
- lea 16(vp_param,n_param,8), vp
-
- mov n_param, %r9
-
- mov %r8, %rcx
- mov $1, R32(%r8)
- shl R8(%rcx), %r8
-
- mul %r8 C initial multiply
-
- and $3, R32(%r9)
- jz L(b0)
- cmp $2, R32(%r9)
- jc L(b1)
- jz L(b2)
-
-L(b3): mov %rax, %r11
- ADDSUB 16(up,n,8), %r11
- mov -8(vp,n,8), %rax
- sbb R32(%rcx), R32(%rcx)
- mov %rdx, %rbx
- mul %r8
- or %rax, %rbx
- mov (vp,n,8), %rax
- mov %rdx, %r9
- mul %r8
- or %rax, %r9
- add $3, n
- jnz L(lo3)
- jmp L(cj3)
-
-L(b2): mov %rax, %rbx
- mov -8(vp,n,8), %rax
- mov %rdx, %r9
- mul %r8
- or %rax, %r9
- add $2, n
- jz L(cj2)
- mov %rdx, %r10
- mov -16(vp,n,8), %rax
- mul %r8
- or %rax, %r10
- xor R32(%rcx), R32(%rcx) C clear carry register
- jmp L(lo2)
-
-L(b1): mov %rax, %r9
- mov %rdx, %r10
- add $1, n
- jnz L(gt1)
- ADDSUB 8(up,n,8), %r9
- jmp L(cj1)
-L(gt1): mov -16(vp,n,8), %rax
- mul %r8
- or %rax, %r10
- mov %rdx, %r11
- mov -8(vp,n,8), %rax
- mul %r8
- or %rax, %r11
- ADDSUB 8(up,n,8), %r9
- ADCSBB 16(up,n,8), %r10
- ADCSBB 24(up,n,8), %r11
- mov (vp,n,8), %rax
- sbb R32(%rcx), R32(%rcx)
- jmp L(lo1)
-
-L(b0): mov %rax, %r10
- mov %rdx, %r11
- mov -8(vp,n,8), %rax
- mul %r8
- or %rax, %r11
- ADDSUB 16(up,n,8), %r10
- ADCSBB 24(up,n,8), %r11
- mov (vp,n,8), %rax
- sbb R32(%rcx), R32(%rcx)
- mov %rdx, %rbx
- mul %r8
- or %rax, %rbx
- mov 8(vp,n,8), %rax
- add $4, n
- jz L(end)
-
- ALIGN(8)
-L(top): mov %rdx, %r9
- mul %r8
- or %rax, %r9
- mov %r10, -16(rp,n,8)
-L(lo3): mov %rdx, %r10
- mov -16(vp,n,8), %rax
- mul %r8
- or %rax, %r10
- mov %r11, -8(rp,n,8)
-L(lo2): mov %rdx, %r11
- mov -8(vp,n,8), %rax
- mul %r8
- or %rax, %r11
- add R32(%rcx), R32(%rcx)
- ADCSBB (up,n,8), %rbx
- ADCSBB 8(up,n,8), %r9
- ADCSBB 16(up,n,8), %r10
- ADCSBB 24(up,n,8), %r11
- mov (vp,n,8), %rax
- sbb R32(%rcx), R32(%rcx)
- mov %rbx, (rp,n,8)
-L(lo1): mov %rdx, %rbx
- mul %r8
- or %rax, %rbx
- mov %r9, 8(rp,n,8)
-L(lo0): mov 8(vp,n,8), %rax
- add $4, n
- jnz L(top)
-
-L(end): mov %rdx, %r9
- mul %r8
- or %rax, %r9
- mov %r10, -16(rp,n,8)
-L(cj3): mov %r11, -8(rp,n,8)
-L(cj2): add R32(%rcx), R32(%rcx)
- ADCSBB (up,n,8), %rbx
- ADCSBB 8(up,n,8), %r9
- mov %rbx, (rp,n,8)
-L(cj1): mov %r9, 8(rp,n,8)
- mov %rdx, %rax
- ADCSBB $0, %rax
- pop %rbx
- pop %rbp
- pop %r12
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/k8/div_qr_1n_pi1.asm b/gmp/mpn/x86_64/k8/div_qr_1n_pi1.asm
deleted file mode 100644
index 861402b222..0000000000
--- a/gmp/mpn/x86_64/k8/div_qr_1n_pi1.asm
+++ /dev/null
@@ -1,249 +0,0 @@
-dnl x86-64 mpn_div_qr_1n_pi1
-dnl -- Divide an mpn number by a normalized single-limb number,
-dnl using a single-limb inverse.
-
-dnl Contributed to the GNU project by Niels Möller
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C c/l
-C AMD K8,K9 11
-C AMD K10 11
-C AMD bull 16
-C AMD pile 14.25
-C AMD steam ?
-C AMD bobcat 16
-C AMD jaguar ?
-C Intel P4 47.5 poor
-C Intel core 28.5 very poor
-C Intel NHM 29 very poor
-C Intel SBR 16 poor
-C Intel IBR 13.5
-C Intel HWL 12
-C Intel BWL ?
-C Intel atom 53 very poor
-C VIA nano 19
-
-
-C INPUT Parameters
-define(`QP', `%rdi')
-define(`UP', `%rsi')
-define(`UN_INPUT', `%rdx')
-define(`U1', `%rcx') C Also in %rax
-define(`D', `%r8')
-define(`DINV', `%r9')
-
-C Invariants
-define(`B2', `%rbp')
-define(`B2md', `%rbx')
-
-C Variables
-define(`UN', `%r8') C Overlaps D input
-define(`T', `%r10')
-define(`U0', `%r11')
-define(`U2', `%r12')
-define(`Q0', `%r13')
-define(`Q1', `%r14')
-define(`Q2', `%r15')
-
-ABI_SUPPORT(STD64)
-
- ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_div_qr_1n_pi1)
- FUNC_ENTRY(6)
-IFDOS(` mov 56(%rsp), %r8 ')
-IFDOS(` mov 64(%rsp), %r9 ')
- dec UN_INPUT
- jnz L(first)
-
- C Just a single 2/1 division.
- C T, U0 are allocated in scratch registers
- lea 1(U1), T
- mov U1, %rax
- mul DINV
- mov (UP), U0
- add U0, %rax
- adc T, %rdx
- mov %rdx, T
- imul D, %rdx
- sub %rdx, U0
- cmp U0, %rax
- lea (U0, D), %rax
- cmovnc U0, %rax
- sbb $0, T
- cmp D, %rax
- jc L(single_div_done)
- sub D, %rax
- add $1, T
-L(single_div_done):
- mov T, (QP)
- FUNC_EXIT
- ret
-L(first):
- C FIXME: Could delay some of these until we enter the loop.
- push %r15
- push %r14
- push %r13
- push %r12
- push %rbx
- push %rbp
-
- mov D, B2
- imul DINV, B2
- neg B2
- mov B2, B2md
- sub D, B2md
-
- C D not needed until final reduction
- push D
- mov UN_INPUT, UN C Clobbers D
-
- mov DINV, %rax
- mul U1
- mov %rax, Q0
- add U1, %rdx
- mov %rdx, T
-
- mov B2, %rax
- mul U1
- mov -8(UP, UN, 8), U0
- mov (UP, UN, 8), U1
- mov T, (QP, UN, 8)
- add %rax, U0
- adc %rdx, U1
- sbb U2, U2
- dec UN
- mov U1, %rax
- jz L(final)
- mov $0, R32(Q1)
-
- ALIGN(16)
-
- C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
- C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
- C is zero, and carry holds an extra copy of U2.
-L(loop):
- C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
- C Remains to add in B (U1 + c)
- cmovc DINV, Q1
- mov U2, Q2
- neg Q2
- mul DINV
- add %rdx, Q1
- adc $0, Q2
- add Q0, Q1
- mov %rax, Q0
- mov B2, %rax
- lea (B2md, U0), T
- adc $0, Q2
-
- C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
- mul U1
- and B2, U2
- add U2, U0
- cmovnc U0, T
-
- C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
- adc U1, Q1
- mov -8(UP, UN, 8), U0
- adc Q2, 8(QP, UN, 8)
- jc L(q_incr)
-L(q_incr_done):
- add %rax, U0
- mov T, %rax
- adc %rdx, %rax
- mov Q1, (QP, UN, 8)
- mov $0, R32(Q1)
- sbb U2, U2
- dec UN
- mov %rax, U1
- jnz L(loop)
-
-L(final):
- pop D
-
- mov U2, Q1
- and D, U2
- sub U2, %rax
- neg Q1
-
- mov %rax, U1
- sub D, %rax
- cmovc U1, %rax
- sbb $-1, Q1
-
- lea 1(%rax), T
- mul DINV
- add U0, %rax
- adc T, %rdx
- mov %rdx, T
- imul D, %rdx
- sub %rdx, U0
- cmp U0, %rax
- lea (U0, D), %rax
- cmovnc U0, %rax
- sbb $0, T
- cmp D, %rax
- jc L(div_done)
- sub D, %rax
- add $1, T
-L(div_done):
- add T, Q0
- mov Q0, (QP)
- adc Q1, 8(QP)
- jnc L(done)
-L(final_q_incr):
- addq $1, 16(QP)
- lea 8(QP), QP
- jc L(final_q_incr)
-
-L(done):
- pop %rbp
- pop %rbx
- pop %r12
- pop %r13
- pop %r14
- pop %r15
- FUNC_EXIT
- ret
-
-L(q_incr):
- C U1 is not live, so use it for indexing
- lea 16(QP, UN, 8), U1
-L(q_incr_loop):
- addq $1, (U1)
- jnc L(q_incr_done)
- lea 8(U1), U1
- jmp L(q_incr_loop)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/k8/gmp-mparam.h b/gmp/mpn/x86_64/k8/gmp-mparam.h
deleted file mode 100644
index df78c38923..0000000000
--- a/gmp/mpn/x86_64/k8/gmp-mparam.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2012, 2014 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-// #undef mpn_sublsh_n
-// #define mpn_sublsh_n(rp,up,vp,n,c) \
-// (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \
-// : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
-
-/* 2500 MHz K8 Brisbane */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 16
-
-#define MUL_TOOM22_THRESHOLD 28
-#define MUL_TOOM33_THRESHOLD 81
-#define MUL_TOOM44_THRESHOLD 242
-#define MUL_TOOM6H_THRESHOLD 345
-#define MUL_TOOM8H_THRESHOLD 482
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 161
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 175
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 166
-
-#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 129
-#define SQR_TOOM4_THRESHOLD 527
-#define SQR_TOOM6_THRESHOLD 562
-#define SQR_TOOM8_THRESHOLD 0 /* always */
-
-#define MULMID_TOOM42_THRESHOLD 36
-
-#define MULMOD_BNM1_THRESHOLD 18
-#define SQRMOD_BNM1_THRESHOLD 22
-
-#define MUL_FFT_MODF_THRESHOLD 654 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 654, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \
- { 15, 7}, { 33, 8}, { 17, 7}, { 37, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 25, 7}, { 51, 8}, { 43, 9}, { 23, 8}, \
- { 51, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \
- { 65, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \
- { 79, 9}, { 43,10}, { 23, 9}, { 59, 8}, \
- { 119,10}, { 31, 8}, { 125, 9}, { 71,10}, \
- { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \
- { 55, 9}, { 123,11}, { 31,10}, { 63, 9}, \
- { 131,10}, { 71, 9}, { 143,10}, { 79, 9}, \
- { 159,10}, { 87,11}, { 47,10}, { 119,11}, \
- { 63,10}, { 143,11}, { 79,10}, { 175,11}, \
- { 95,10}, { 199,11}, { 111,10}, { 223,12}, \
- { 63,11}, { 143,10}, { 287, 9}, { 575,10}, \
- { 295,11}, { 159,10}, { 319,11}, { 175,12}, \
- { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
- { 415, 9}, { 831,11}, { 223,10}, { 447,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 271,10}, { 543, 9}, { 1087,11}, { 287,10}, \
- { 575, 9}, { 1151,12}, { 159,11}, { 319,10}, \
- { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
- { 703,11}, { 367,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 399,10}, { 799,11}, { 415,10}, \
- { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
- { 479,13}, { 127,12}, { 255,11}, { 511,10}, \
- { 1023,11}, { 543,10}, { 1087,12}, { 287,11}, \
- { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
- { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \
- { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
- { 895,12}, { 479,11}, { 959,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
- { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \
- { 1215,13}, { 319,12}, { 671,11}, { 1343,12}, \
- { 735,13}, { 383,12}, { 799,11}, { 1599,12}, \
- { 863,13}, { 447,12}, { 895,11}, { 1791,12}, \
- { 991,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
- { 1471,14}, { 383,13}, { 767,12}, { 1599,13}, \
- { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
- { 959,12}, { 1919,14}, { 511,13}, { 1087,12}, \
- { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \
- { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
- { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
- { 2367,14}, { 1279,13}, { 2559,14}, { 1407,15}, \
- { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
- { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
- {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 205
-#define MUL_FFT_THRESHOLD 11520
-
-#define SQR_FFT_MODF_THRESHOLD 570 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
- { 14, 5}, { 29, 6}, { 16, 5}, { 33, 6}, \
- { 29, 7}, { 15, 6}, { 31, 7}, { 16, 6}, \
- { 33, 7}, { 17, 6}, { 35, 7}, { 33, 8}, \
- { 17, 7}, { 37, 8}, { 19, 7}, { 40, 8}, \
- { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
- { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \
- { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
- { 51, 9}, { 27, 8}, { 55, 9}, { 31, 8}, \
- { 63, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \
- { 79, 9}, { 43,10}, { 23, 9}, { 55,10}, \
- { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \
- { 47, 9}, { 99,10}, { 55, 9}, { 123,11}, \
- { 31,10}, { 63, 9}, { 127,10}, { 71, 9}, \
- { 143,10}, { 87,11}, { 47,10}, { 111,12}, \
- { 31,11}, { 63,10}, { 143,11}, { 79,10}, \
- { 167,11}, { 95,10}, { 199,11}, { 111,12}, \
- { 63,11}, { 127, 9}, { 511,11}, { 143,10}, \
- { 287, 9}, { 575, 8}, { 1151,11}, { 159,10}, \
- { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \
- { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
- { 415, 9}, { 831,10}, { 431,11}, { 223,10}, \
- { 447,13}, { 63,12}, { 127,10}, { 511, 9}, \
- { 1023,10}, { 543, 9}, { 1087,11}, { 287,10}, \
- { 575, 9}, { 1151,12}, { 159,11}, { 319,10}, \
- { 639, 9}, { 1279,11}, { 335,10}, { 671,11}, \
- { 351,10}, { 703,11}, { 367,12}, { 191,11}, \
- { 383,10}, { 767,11}, { 399,10}, { 799,11}, \
- { 415,10}, { 831,11}, { 431,12}, { 223,11}, \
- { 447,10}, { 895,11}, { 463,13}, { 127,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,10}, \
- { 1215,12}, { 319,11}, { 639,10}, { 1279,11}, \
- { 671,12}, { 351,11}, { 703,10}, { 1407,13}, \
- { 191,12}, { 383,11}, { 767,10}, { 1535,11}, \
- { 799,12}, { 415,11}, { 831,12}, { 447,11}, \
- { 895,12}, { 479,14}, { 127,12}, { 511,11}, \
- { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
- { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \
- { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
- { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
- { 799,11}, { 1599,12}, { 863,13}, { 447,12}, \
- { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \
- { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
- { 1471,14}, { 383,13}, { 767,12}, { 1599,13}, \
- { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
- { 959,14}, { 511,13}, { 1023,12}, { 2047,13}, \
- { 1215,14}, { 639,13}, { 1471,14}, { 767,13}, \
- { 1663,14}, { 895,13}, { 1855,15}, { 511,14}, \
- { 1023,13}, { 2111,14}, { 1151,13}, { 2303,14}, \
- { 1407,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 214
-#define SQR_FFT_THRESHOLD 5760
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 61
-#define MULLO_MUL_N_THRESHOLD 22906
-
-#define DC_DIV_QR_THRESHOLD 51
-#define DC_DIVAPPR_Q_THRESHOLD 264
-#define DC_BDIV_QR_THRESHOLD 38
-#define DC_BDIV_Q_THRESHOLD 170
-
-#define INV_MULMOD_BNM1_THRESHOLD 67
-#define INV_NEWTON_THRESHOLD 246
-#define INV_APPR_THRESHOLD 244
-
-#define BINV_NEWTON_THRESHOLD 252
-#define REDC_1_TO_REDC_2_THRESHOLD 35
-#define REDC_2_TO_REDC_N_THRESHOLD 84
-
-#define MU_DIV_QR_THRESHOLD 2089
-#define MU_DIVAPPR_Q_THRESHOLD 1752
-#define MUPI_DIV_QR_THRESHOLD 93
-#define MU_BDIV_QR_THRESHOLD 1718
-#define MU_BDIV_Q_THRESHOLD 1895
-
-#define POWM_SEC_TABLE 2,16,194,904,2177
-
-#define MATRIX22_STRASSEN_THRESHOLD 21
-#define HGCD_THRESHOLD 148
-#define HGCD_APPR_THRESHOLD 185
-#define HGCD_REDUCE_THRESHOLD 4120
-#define GCD_DC_THRESHOLD 562
-#define GCDEXT_DC_THRESHOLD 501
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 17
-#define GET_STR_PRECOMPUTE_THRESHOLD 29
-#define SET_STR_DC_THRESHOLD 268
-#define SET_STR_PRECOMPUTE_THRESHOLD 1787
-
-#define FAC_DSC_THRESHOLD 1240
-#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp/mpn/x86_64/k8/mullo_basecase.asm b/gmp/mpn/x86_64/k8/mullo_basecase.asm
deleted file mode 100644
index fa00f4234a..0000000000
--- a/gmp/mpn/x86_64/k8/mullo_basecase.asm
+++ /dev/null
@@ -1,436 +0,0 @@
-dnl AMD64 mpn_mullo_basecase.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
-
-C NOTES
-C * There is a major stupidity in that we call mpn_mul_1 initially, for a
-C large trip count. Instead, we should start with mul_2 for any operand
-C size congruence class.
-C * Stop iterating addmul_2 earlier, falling into straight-line triangle code
-C for the last 2-3 iterations.
-C * Perhaps implement n=4 special code.
-C * The reload of the outer loop jump address hurts branch prediction.
-C * The addmul_2 loop ends with an MUL whose high part is not used upon loop
-C exit.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp_param', `%rdx')
-define(`n', `%rcx')
-
-define(`vp', `%r11')
-define(`outer_addr', `%r8')
-define(`j', `%r9')
-define(`v0', `%r13')
-define(`v1', `%r14')
-define(`w0', `%rbx')
-define(`w1', `%r15')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mullo_basecase)
- FUNC_ENTRY(4)
- cmp $4, n
- jge L(gen)
- mov (up), %rax C u0
- mov (vp_param), %r8 C v0
-
- lea L(tab)(%rip), %r9
-ifdef(`PIC',
-` movslq (%r9,%rcx,4), %r10
- add %r10, %r9
- jmp *%r9
-',`
- jmp *(%r9,n,8)
-')
- JUMPTABSECT
- ALIGN(8)
-L(tab): JMPENT( L(tab), L(tab)) C not allowed
- JMPENT( L(1), L(tab)) C 1
- JMPENT( L(2), L(tab)) C 2
- JMPENT( L(3), L(tab)) C 3
-dnl JMPENT( L(0m4), L(tab)) C 4
-dnl JMPENT( L(1m4), L(tab)) C 5
-dnl JMPENT( L(2m4), L(tab)) C 6
-dnl JMPENT( L(3m4), L(tab)) C 7
-dnl JMPENT( L(0m4), L(tab)) C 8
-dnl JMPENT( L(1m4), L(tab)) C 9
-dnl JMPENT( L(2m4), L(tab)) C 10
-dnl JMPENT( L(3m4), L(tab)) C 11
- TEXT
-
-L(1): imul %r8, %rax
- mov %rax, (rp)
- FUNC_EXIT()
- ret
-
-L(2): mov 8(vp_param), %r11
- imul %rax, %r11 C u0 x v1
- mul %r8 C u0 x v0
- mov %rax, (rp)
- imul 8(up), %r8 C u1 x v0
- lea (%r11, %rdx), %rax
- add %r8, %rax
- mov %rax, 8(rp)
- FUNC_EXIT()
- ret
-
-L(3): mov 8(vp_param), %r9 C v1
- mov 16(vp_param), %r11
- mul %r8 C u0 x v0 -> <r1,r0>
- mov %rax, (rp) C r0
- mov (up), %rax C u0
- mov %rdx, %rcx C r1
- mul %r9 C u0 x v1 -> <r2,r1>
- imul 8(up), %r9 C u1 x v1 -> r2
- mov 16(up), %r10
- imul %r8, %r10 C u2 x v0 -> r2
- add %rax, %rcx
- adc %rdx, %r9
- add %r10, %r9
- mov 8(up), %rax C u1
- mul %r8 C u1 x v0 -> <r2,r1>
- add %rax, %rcx
- adc %rdx, %r9
- mov %r11, %rax
- imul (up), %rax C u0 x v2 -> r2
- add %rax, %r9
- mov %rcx, 8(rp)
- mov %r9, 16(rp)
- FUNC_EXIT()
- ret
-
-L(0m4):
-L(1m4):
-L(2m4):
-L(3m4):
-L(gen): push %rbx
- push %rbp
- push %r13
- push %r14
- push %r15
-
- mov (up), %rax
- mov (vp_param), v0
- mov vp_param, vp
-
- lea (rp,n,8), rp
- lea (up,n,8), up
- neg n
-
- mul v0
-
- test $1, R8(n)
- jz L(mul_2)
-
-L(mul_1):
- lea -8(rp), rp
- lea -8(up), up
- test $2, R8(n)
- jnz L(mul_1_prologue_3)
-
-L(mul_1_prologue_2): C n = 7, 11, 15, ...
- lea -1(n), j
- lea L(addmul_outer_1)(%rip), outer_addr
- mov %rax, w0
- mov %rdx, w1
- xor R32(w2), R32(w2)
- xor R32(w3), R32(w3)
- mov 16(up,n,8), %rax
- jmp L(mul_1_entry_2)
-
-L(mul_1_prologue_3): C n = 5, 9, 13, ...
- lea 1(n), j
- lea L(addmul_outer_3)(%rip), outer_addr
- mov %rax, w2
- mov %rdx, w3
- xor R32(w0), R32(w0)
- jmp L(mul_1_entry_0)
-
- ALIGN(16)
-L(mul_1_top):
- mov w0, -16(rp,j,8)
- add %rax, w1
- mov (up,j,8), %rax
- adc %rdx, w2
- xor R32(w0), R32(w0)
- mul v0
- mov w1, -8(rp,j,8)
- add %rax, w2
- adc %rdx, w3
-L(mul_1_entry_0):
- mov 8(up,j,8), %rax
- mul v0
- mov w2, (rp,j,8)
- add %rax, w3
- adc %rdx, w0
- mov 16(up,j,8), %rax
- mul v0
- mov w3, 8(rp,j,8)
- xor R32(w2), R32(w2) C zero
- mov w2, w3 C zero
- add %rax, w0
- mov 24(up,j,8), %rax
- mov w2, w1 C zero
- adc %rdx, w1
-L(mul_1_entry_2):
- mul v0
- add $4, j
- js L(mul_1_top)
-
- mov w0, -16(rp)
- add %rax, w1
- mov w1, -8(rp)
- adc %rdx, w2
-
- imul (up), v0
- add v0, w2
- mov w2, (rp)
-
- add $1, n
- jz L(ret)
-
- mov 8(vp), v0
- mov 16(vp), v1
-
- lea 16(up), up
- lea 8(vp), vp
- lea 24(rp), rp
-
- jmp *outer_addr
-
-
-L(mul_2):
- mov 8(vp), v1
- test $2, R8(n)
- jz L(mul_2_prologue_3)
-
- ALIGN(16)
-L(mul_2_prologue_1):
- lea 0(n), j
- mov %rax, w3
- mov %rdx, w0
- xor R32(w1), R32(w1)
- mov (up,n,8), %rax
- lea L(addmul_outer_3)(%rip), outer_addr
- jmp L(mul_2_entry_1)
-
- ALIGN(16)
-L(mul_2_prologue_3):
- lea 2(n), j
- mov $0, R32(w3)
- mov %rax, w1
- mov (up,n,8), %rax
- mov %rdx, w2
- lea L(addmul_outer_1)(%rip), outer_addr
- jmp L(mul_2_entry_3)
-
- ALIGN(16)
-L(mul_2_top):
- mov -32(up,j,8), %rax
- mul v1
- add %rax, w0
- adc %rdx, w1
- mov -24(up,j,8), %rax
- xor R32(w2), R32(w2)
- mul v0
- add %rax, w0
- mov -24(up,j,8), %rax
- adc %rdx, w1
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- mov w0, -24(rp,j,8)
- adc %rdx, w2
- mov -16(up,j,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- mov -16(up,j,8), %rax
- adc $0, R32(w3)
-L(mul_2_entry_3):
- mov $0, R32(w0)
- mov w1, -16(rp,j,8)
- mul v1
- add %rax, w2
- mov -8(up,j,8), %rax
- adc %rdx, w3
- mov $0, R32(w1)
- mul v0
- add %rax, w2
- mov -8(up,j,8), %rax
- adc %rdx, w3
- adc R32(w1), R32(w0)
- mul v1
- add %rax, w3
- mov w2, -8(rp,j,8)
- adc %rdx, w0
- mov (up,j,8), %rax
- mul v0
- add %rax, w3
- adc %rdx, w0
- adc $0, R32(w1)
-L(mul_2_entry_1):
- add $4, j
- mov w3, -32(rp,j,8)
- js L(mul_2_top)
-
- imul -16(up), v1
- add v1, w0
- imul -8(up), v0
- add v0, w0
- mov w0, -8(rp)
-
- add $2, n
- jz L(ret)
-
- mov 16(vp), v0
- mov 24(vp), v1
-
- lea 16(vp), vp
- lea 16(rp), rp
-
- jmp *outer_addr
-
-
-L(addmul_outer_1):
- lea -2(n), j
- mov -16(up,n,8), %rax
- mul v0
- mov %rax, w3
- mov -16(up,n,8), %rax
- mov %rdx, w0
- xor R32(w1), R32(w1)
- lea L(addmul_outer_3)(%rip), outer_addr
- jmp L(addmul_entry_1)
-
-L(addmul_outer_3):
- lea 0(n), j
- mov -16(up,n,8), %rax
- xor R32(w3), R32(w3)
- mul v0
- mov %rax, w1
- mov -16(up,n,8), %rax
- mov %rdx, w2
- lea L(addmul_outer_1)(%rip), outer_addr
- jmp L(addmul_entry_3)
-
- ALIGN(16)
-L(addmul_top):
- add w3, -32(rp,j,8)
- adc %rax, w0
- mov -24(up,j,8), %rax
- adc %rdx, w1
- xor R32(w2), R32(w2)
- mul v0
- add %rax, w0
- mov -24(up,j,8), %rax
- adc %rdx, w1
- adc R32(w2), R32(w2)
- mul v1
- xor R32(w3), R32(w3)
- add w0, -24(rp,j,8)
- adc %rax, w1
- mov -16(up,j,8), %rax
- adc %rdx, w2
- mul v0
- add %rax, w1
- mov -16(up,j,8), %rax
- adc %rdx, w2
- adc $0, R32(w3)
-L(addmul_entry_3):
- mul v1
- add w1, -16(rp,j,8)
- adc %rax, w2
- mov -8(up,j,8), %rax
- adc %rdx, w3
- mul v0
- xor R32(w0), R32(w0)
- add %rax, w2
- adc %rdx, w3
- mov $0, R32(w1)
- mov -8(up,j,8), %rax
- adc R32(w1), R32(w0)
- mul v1
- add w2, -8(rp,j,8)
- adc %rax, w3
- adc %rdx, w0
- mov (up,j,8), %rax
- mul v0
- add %rax, w3
- mov (up,j,8), %rax
- adc %rdx, w0
- adc $0, R32(w1)
-L(addmul_entry_1):
- mul v1
- add $4, j
- js L(addmul_top)
-
- add w3, -32(rp)
- adc %rax, w0
-
- imul -24(up), v0
- add v0, w0
- add w0, -24(rp)
-
- add $2, n
- jns L(ret)
-
- lea 16(vp), vp
-
- mov (vp), v0
- mov 8(vp), v1
-
- lea -16(up), up
-
- jmp *outer_addr
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/k8/mulmid_basecase.asm b/gmp/mpn/x86_64/k8/mulmid_basecase.asm
deleted file mode 100644
index 86f1414ed8..0000000000
--- a/gmp/mpn/x86_64/k8/mulmid_basecase.asm
+++ /dev/null
@@ -1,559 +0,0 @@
-dnl AMD64 mpn_mulmid_basecase
-
-dnl Contributed by David Harvey.
-
-dnl Copyright 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-C cycles/limb
-C K8,K9: 2.375 (2.5 when un - vn is "small")
-C K10: ?
-C P4: ?
-C P6-15: ?
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`un_param',`%rdx')
-define(`vp_param',`%rcx')
-define(`vn', `%r8')
-
-define(`v0', `%r12')
-define(`v1', `%r9')
-
-define(`w0', `%rbx')
-define(`w1', `%rcx')
-define(`w2', `%rbp')
-define(`w3', `%r10')
-
-define(`n', `%r11')
-define(`outer_addr', `%r14')
-define(`un', `%r13')
-define(`vp', `%r15')
-
-define(`vp_inner', `%r10')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mulmid_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov vp_param, vp
-
- C use un for row length (= un_param - vn + 1)
- lea 1(un_param), un
- sub vn, un
-
- lea (rp,un,8), rp
-
- cmp $4, un C TODO: needs tuning
- jc L(diagonal)
-
- lea (up,un_param,8), up
-
- test $1, vn
- jz L(mul_2)
-
-C ===========================================================
-C mul_1 for vp[0] if vn is odd
-
-L(mul_1):
- mov R32(un), R32(w0)
-
- neg un
- mov (up,un,8), %rax
- mov (vp), v0
- mul v0
-
- and $-4, un C round down to multiple of 4
- mov un, n
-
- and $3, R32(w0)
- jz L(mul_1_prologue_0)
- cmp $2, R32(w0)
- jc L(mul_1_prologue_1)
- jz L(mul_1_prologue_2)
-
-L(mul_1_prologue_3):
- mov %rax, w3
- mov %rdx, w0
- lea L(addmul_prologue_3)(%rip), outer_addr
- jmp L(mul_1_entry_3)
-
- ALIGN(16)
-L(mul_1_prologue_0):
- mov %rax, w2
- mov %rdx, w3 C note already w0 == 0
- lea L(addmul_prologue_0)(%rip), outer_addr
- jmp L(mul_1_entry_0)
-
- ALIGN(16)
-L(mul_1_prologue_1):
- add $4, n
- mov %rax, w1
- mov %rdx, w2
- mov $0, R32(w3)
- mov (up,n,8), %rax
- lea L(addmul_prologue_1)(%rip), outer_addr
- jmp L(mul_1_entry_1)
-
- ALIGN(16)
-L(mul_1_prologue_2):
- mov %rax, w0
- mov %rdx, w1
- mov 24(up,n,8), %rax
- mov $0, R32(w2)
- mov $0, R32(w3)
- lea L(addmul_prologue_2)(%rip), outer_addr
- jmp L(mul_1_entry_2)
-
-
- C this loop is 10 c/loop = 2.5 c/l on K8
-
- ALIGN(16)
-L(mul_1_top):
- mov w0, -16(rp,n,8)
- add %rax, w1
- mov (up,n,8), %rax
- adc %rdx, w2
-L(mul_1_entry_1):
- mov $0, R32(w0)
- mul v0
- mov w1, -8(rp,n,8)
- add %rax, w2
- adc %rdx, w3
-L(mul_1_entry_0):
- mov 8(up,n,8), %rax
- mul v0
- mov w2, (rp,n,8)
- add %rax, w3
- adc %rdx, w0
-L(mul_1_entry_3):
- mov 16(up,n,8), %rax
- mul v0
- mov w3, 8(rp,n,8)
- mov $0, R32(w2) C zero
- mov w2, w3 C zero
- add %rax, w0
- mov 24(up,n,8), %rax
- mov w2, w1 C zero
- adc %rdx, w1
-L(mul_1_entry_2):
- mul v0
- add $4, n
- js L(mul_1_top)
-
- mov w0, -16(rp)
- add %rax, w1
- mov w1, -8(rp)
- mov w2, 8(rp) C zero last limb of output
- adc %rdx, w2
- mov w2, (rp)
-
- dec vn
- jz L(ret)
-
- lea -8(up), up
- lea 8(vp), vp
-
- mov un, n
- mov (vp), v0
- mov 8(vp), v1
-
- jmp *outer_addr
-
-C ===========================================================
-C mul_2 for vp[0], vp[1] if vn is even
-
- ALIGN(16)
-L(mul_2):
- mov R32(un), R32(w0)
-
- neg un
- mov -8(up,un,8), %rax
- mov (vp), v0
- mov 8(vp), v1
- mul v1
-
- and $-4, un C round down to multiple of 4
- mov un, n
-
- and $3, R32(w0)
- jz L(mul_2_prologue_0)
- cmp $2, R32(w0)
- jc L(mul_2_prologue_1)
- jz L(mul_2_prologue_2)
-
-L(mul_2_prologue_3):
- mov %rax, w1
- mov %rdx, w2
- lea L(addmul_prologue_3)(%rip), outer_addr
- jmp L(mul_2_entry_3)
-
- ALIGN(16)
-L(mul_2_prologue_0):
- mov %rax, w0
- mov %rdx, w1
- lea L(addmul_prologue_0)(%rip), outer_addr
- jmp L(mul_2_entry_0)
-
- ALIGN(16)
-L(mul_2_prologue_1):
- mov %rax, w3
- mov %rdx, w0
- mov $0, R32(w1)
- lea L(addmul_prologue_1)(%rip), outer_addr
- jmp L(mul_2_entry_1)
-
- ALIGN(16)
-L(mul_2_prologue_2):
- mov %rax, w2
- mov %rdx, w3
- mov $0, R32(w0)
- mov 16(up,n,8), %rax
- lea L(addmul_prologue_2)(%rip), outer_addr
- jmp L(mul_2_entry_2)
-
-
- C this loop is 18 c/loop = 2.25 c/l on K8
-
- ALIGN(16)
-L(mul_2_top):
- mov -8(up,n,8), %rax
- mul v1
- add %rax, w0
- adc %rdx, w1
-L(mul_2_entry_0):
- mov $0, R32(w2)
- mov (up,n,8), %rax
- mul v0
- add %rax, w0
- mov (up,n,8), %rax
- adc %rdx, w1
- adc $0, R32(w2)
- mul v1
- add %rax, w1
- mov w0, (rp,n,8)
- adc %rdx, w2
-L(mul_2_entry_3):
- mov 8(up,n,8), %rax
- mul v0
- mov $0, R32(w3)
- add %rax, w1
- adc %rdx, w2
- mov $0, R32(w0)
- adc $0, R32(w3)
- mov 8(up,n,8), %rax
- mov w1, 8(rp,n,8)
- mul v1
- add %rax, w2
- mov 16(up,n,8), %rax
- adc %rdx, w3
-L(mul_2_entry_2):
- mov $0, R32(w1)
- mul v0
- add %rax, w2
- mov 16(up,n,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- add %rax, w3
- mov w2, 16(rp,n,8)
- adc %rdx, w0
-L(mul_2_entry_1):
- mov 24(up,n,8), %rax
- mul v0
- add %rax, w3
- adc %rdx, w0
- adc $0, R32(w1)
- add $4, n
- mov w3, -8(rp,n,8)
- jnz L(mul_2_top)
-
- mov w0, (rp)
- mov w1, 8(rp)
-
- sub $2, vn
- jz L(ret)
-
- lea 16(vp), vp
- lea -16(up), up
-
- mov un, n
- mov (vp), v0
- mov 8(vp), v1
-
- jmp *outer_addr
-
-C ===========================================================
-C addmul_2 for remaining vp's
-
- ALIGN(16)
-L(addmul_prologue_0):
- mov -8(up,n,8), %rax
- mul v1
- mov %rax, w1
- mov %rdx, w2
- mov $0, R32(w3)
- jmp L(addmul_entry_0)
-
- ALIGN(16)
-L(addmul_prologue_1):
- mov 16(up,n,8), %rax
- mul v1
- mov %rax, w0
- mov %rdx, w1
- mov $0, R32(w2)
- mov 24(up,n,8), %rax
- jmp L(addmul_entry_1)
-
- ALIGN(16)
-L(addmul_prologue_2):
- mov 8(up,n,8), %rax
- mul v1
- mov %rax, w3
- mov %rdx, w0
- mov $0, R32(w1)
- jmp L(addmul_entry_2)
-
- ALIGN(16)
-L(addmul_prologue_3):
- mov (up,n,8), %rax
- mul v1
- mov %rax, w2
- mov %rdx, w3
- mov $0, R32(w0)
- mov $0, R32(w1)
- jmp L(addmul_entry_3)
-
- C this loop is 19 c/loop = 2.375 c/l on K8
-
- ALIGN(16)
-L(addmul_top):
- mov $0, R32(w3)
- add %rax, w0
- mov -8(up,n,8), %rax
- adc %rdx, w1
- adc $0, R32(w2)
- mul v1
- add w0, -8(rp,n,8)
- adc %rax, w1
- adc %rdx, w2
-L(addmul_entry_0):
- mov (up,n,8), %rax
- mul v0
- add %rax, w1
- mov (up,n,8), %rax
- adc %rdx, w2
- adc $0, R32(w3)
- mul v1
- add w1, (rp,n,8)
- mov $0, R32(w1)
- adc %rax, w2
- mov $0, R32(w0)
- adc %rdx, w3
-L(addmul_entry_3):
- mov 8(up,n,8), %rax
- mul v0
- add %rax, w2
- mov 8(up,n,8), %rax
- adc %rdx, w3
- adc $0, R32(w0)
- mul v1
- add w2, 8(rp,n,8)
- adc %rax, w3
- adc %rdx, w0
-L(addmul_entry_2):
- mov 16(up,n,8), %rax
- mul v0
- add %rax, w3
- mov 16(up,n,8), %rax
- adc %rdx, w0
- adc $0, R32(w1)
- mul v1
- add w3, 16(rp,n,8)
- nop C don't ask...
- adc %rax, w0
- mov $0, R32(w2)
- mov 24(up,n,8), %rax
- adc %rdx, w1
-L(addmul_entry_1):
- mul v0
- add $4, n
- jnz L(addmul_top)
-
- add %rax, w0
- adc %rdx, w1
- adc $0, R32(w2)
-
- add w0, -8(rp)
- adc w1, (rp)
- adc w2, 8(rp)
-
- sub $2, vn
- jz L(ret)
-
- lea 16(vp), vp
- lea -16(up), up
-
- mov un, n
- mov (vp), v0
- mov 8(vp), v1
-
- jmp *outer_addr
-
-C ===========================================================
-C accumulate along diagonals if un - vn is small
-
- ALIGN(16)
-L(diagonal):
- xor R32(w0), R32(w0)
- xor R32(w1), R32(w1)
- xor R32(w2), R32(w2)
-
- neg un
-
- mov R32(vn), %eax
- and $3, %eax
- jz L(diag_prologue_0)
- cmp $2, %eax
- jc L(diag_prologue_1)
- jz L(diag_prologue_2)
-
-L(diag_prologue_3):
- lea -8(vp), vp
- mov vp, vp_inner
- add $1, vn
- mov vn, n
- lea L(diag_entry_3)(%rip), outer_addr
- jmp L(diag_entry_3)
-
-L(diag_prologue_0):
- mov vp, vp_inner
- mov vn, n
- lea 0(%rip), outer_addr
- mov -8(up,n,8), %rax
- jmp L(diag_entry_0)
-
-L(diag_prologue_1):
- lea 8(vp), vp
- mov vp, vp_inner
- add $3, vn
- mov vn, n
- lea 0(%rip), outer_addr
- mov -8(vp_inner), %rax
- jmp L(diag_entry_1)
-
-L(diag_prologue_2):
- lea -16(vp), vp
- mov vp, vp_inner
- add $2, vn
- mov vn, n
- lea 0(%rip), outer_addr
- mov 16(vp_inner), %rax
- jmp L(diag_entry_2)
-
-
- C this loop is 10 c/loop = 2.5 c/l on K8
-
- ALIGN(16)
-L(diag_top):
- add %rax, w0
- adc %rdx, w1
- mov -8(up,n,8), %rax
- adc $0, w2
-L(diag_entry_0):
- mulq (vp_inner)
- add %rax, w0
- adc %rdx, w1
- adc $0, w2
-L(diag_entry_3):
- mov -16(up,n,8), %rax
- mulq 8(vp_inner)
- add %rax, w0
- mov 16(vp_inner), %rax
- adc %rdx, w1
- adc $0, w2
-L(diag_entry_2):
- mulq -24(up,n,8)
- add %rax, w0
- mov 24(vp_inner), %rax
- adc %rdx, w1
- lea 32(vp_inner), vp_inner
- adc $0, w2
-L(diag_entry_1):
- mulq -32(up,n,8)
- sub $4, n
- jnz L(diag_top)
-
- add %rax, w0
- adc %rdx, w1
- adc $0, w2
-
- mov w0, (rp,un,8)
-
- inc un
- jz L(diag_end)
-
- mov vn, n
- mov vp, vp_inner
-
- lea 8(up), up
- mov w1, w0
- mov w2, w1
- xor R32(w2), R32(w2)
-
- jmp *outer_addr
-
-L(diag_end):
- mov w1, (rp)
- mov w2, 8(rp)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/k8/redc_1.asm b/gmp/mpn/x86_64/k8/redc_1.asm
deleted file mode 100644
index 74538986f9..0000000000
--- a/gmp/mpn/x86_64/k8/redc_1.asm
+++ /dev/null
@@ -1,590 +0,0 @@
-dnl X86-64 mpn_redc_1 optimised for AMD K8-K10.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2004, 2008, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bull ?
-C AMD pile ?
-C AMD steam ?
-C AMD bobcat ?
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core ?
-C Intel NHM ?
-C Intel SBR ?
-C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom ?
-C VIA nano ?
-
-C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjörn Granlund.
-
-C TODO
-C * Micro-optimise, none performed thus far.
-C * This looks different from other current redc_1.asm variants. Consider
-C adapting this to the mainstream style.
-C * Is this code really faster than more approaches which compute q0 later?
-C Is the use of a jump jump table faster? Or is the edge of this due to the
-C inlined add_n code?
-C * Put initial m[0] x q0 computation in header.
-C * Put basecases at the file's end, single them out before the pushes.
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`mp_param', `%rdx') C r8
-define(`n', `%rcx') C r9
-define(`u0inv', `%r8') C stack
-
-define(`i', `%r11')
-define(`nneg', `%r12')
-define(`mp', `%r13')
-define(`q0', `%rbp')
-define(`vp', `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_redc_1)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbp
- mov (up), q0 C up[0]
- push %rbx
- imul u0inv, q0 C first q0, for all execution paths
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov n, nneg
- neg nneg
- lea (mp_param,n,8), mp C mp += n
- lea -16(up,n,8), up C up += n
-
- mov R32(n), R32(%rax)
- and $3, R32(%rax)
- lea 4(%rax), %r9
- cmp $4, R32(n)
- cmovg %r9, %rax
- lea L(tab)(%rip), %r9
-ifdef(`PIC',`
- movslq (%r9,%rax,4), %rax
- add %r9, %rax
- jmp *%rax
-',`
- jmp *(%r9,%rax,8)
-')
-
- JUMPTABSECT
- ALIGN(8)
-L(tab): JMPENT( L(0m4), L(tab))
- JMPENT( L(1), L(tab))
- JMPENT( L(2), L(tab))
- JMPENT( L(3), L(tab))
- JMPENT( L(0m4), L(tab))
- JMPENT( L(1m4), L(tab))
- JMPENT( L(2m4), L(tab))
- JMPENT( L(3m4), L(tab))
- TEXT
-
- ALIGN(16)
-L(1): mov (mp_param), %rax
- mul q0
- add 8(up), %rax
- adc 16(up), %rdx
- mov %rdx, (rp)
- mov $0, R32(%rax)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-
- ALIGN(16)
-L(2): mov (mp_param), %rax
- mul q0
- xor R32(%r14), R32(%r14)
- mov %rax, %r10
- mov -8(mp), %rax
- mov %rdx, %r9
- mul q0
- add (up), %r10
- adc %rax, %r9
- adc %rdx, %r14
- add 8(up), %r9
- adc $0, %r14
- mov %r9, q0
- imul u0inv, q0
- mov -16(mp), %rax
- mul q0
- xor R32(%rbx), R32(%rbx)
- mov %rax, %r10
- mov -8(mp), %rax
- mov %rdx, %r11
- mul q0
- add %r9, %r10
- adc %rax, %r11
- adc %rdx, %rbx
- add 16(up), %r11
- adc $0, %rbx
- xor R32(%rax), R32(%rax)
- add %r11, %r14
- adc 24(up), %rbx
- mov %r14, (rp)
- mov %rbx, 8(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-
-L(3): mov (mp_param), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, %r10
- mov -16(mp), %rax
- mul q0
- xor R32(%r9), R32(%r9)
- xor R32(%r14), R32(%r14)
- add -8(up), %rbx
- adc %rax, %r10
- mov -8(mp), %rax
- adc %rdx, %r9
- mul q0
- add (up), %r10
- mov %r10, (up)
- adc %rax, %r9
- adc %rdx, %r14
- mov %r10, q0
- imul u0inv, q0
- add %r9, 8(up)
- adc $0, %r14
- mov %r14, -8(up)
-
- mov -24(mp), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, %r10
- mov -16(mp), %rax
- mul q0
- xor R32(%r9), R32(%r9)
- xor R32(%r14), R32(%r14)
- add (up), %rbx
- adc %rax, %r10
- mov -8(mp), %rax
- adc %rdx, %r9
- mul q0
- add 8(up), %r10
- mov %r10, 8(up)
- adc %rax, %r9
- adc %rdx, %r14
- mov %r10, q0
- imul u0inv, q0
- add %r9, 16(up)
- adc $0, %r14
- mov %r14, (up)
-
- mov -24(mp), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, %r10
- mov -16(mp), %rax
- mul q0
- xor R32(%r9), R32(%r9)
- xor R32(%r14), R32(%r14)
- add 8(up), %rbx
- adc %rax, %r10
- mov -8(mp), %rax
- adc %rdx, %r9
- mul q0
- add 16(up), %r10
- adc %rax, %r9
- adc %rdx, %r14
- add 24(up), %r9
- adc $0, %r14
-
- xor R32(%rax), R32(%rax)
- add -8(up), %r10
- adc (up), %r9
- adc 32(up), %r14
- mov %r10, (rp)
- mov %r9, 8(rp)
- mov %r14, 16(rp)
- adc R32(%rax), R32(%rax)
- jmp L(ret)
-
-
- ALIGN(16)
-L(2m4):
-L(lo2): mov (mp,nneg,8), %rax
- mul q0
- xor R32(%r14), R32(%r14)
- xor R32(%rbx), R32(%rbx)
- mov %rax, %r10
- mov 8(mp,nneg,8), %rax
- mov 24(up,nneg,8), %r15
- mov %rdx, %r9
- mul q0
- add 16(up,nneg,8), %r10
- adc %rax, %r9
- mov 16(mp,nneg,8), %rax
- adc %rdx, %r14
- mul q0
- mov $0, R32(%r10) C xor?
- lea 2(nneg), i
- add %r9, %r15
- imul u0inv, %r15
- jmp L(e2)
-
- ALIGN(16)
-L(li2): add %r10, (up,i,8)
- adc %rax, %r9
- mov (mp,i,8), %rax
- adc %rdx, %r14
- xor R32(%r10), R32(%r10)
- mul q0
-L(e2): add %r9, 8(up,i,8)
- adc %rax, %r14
- adc %rdx, %rbx
- mov 8(mp,i,8), %rax
- mul q0
- add %r14, 16(up,i,8)
- adc %rax, %rbx
- adc %rdx, %r10
- mov 16(mp,i,8), %rax
- mul q0
- add %rbx, 24(up,i,8)
- mov $0, R32(%r14) C zero
- mov %r14, %rbx C zero
- adc %rax, %r10
- mov 24(mp,i,8), %rax
- mov %r14, %r9 C zero
- adc %rdx, %r9
- mul q0
- add $4, i
- js L(li2)
-
-L(le2): add %r10, (up)
- adc %rax, %r9
- adc %r14, %rdx
- add %r9, 8(up)
- adc $0, %rdx
- mov %rdx, 16(up,nneg,8) C up[0]
- add $8, up
- mov %r15, q0
- dec n
- jnz L(lo2)
-
- mov nneg, n
- sar $2, n
- lea 32(up,nneg,8), up
- lea (up,nneg,8), vp
-
- mov -16(up), %r8
- mov -8(up), %r9
- add -16(vp), %r8
- adc -8(vp), %r9
- mov %r8, (rp)
- mov %r9, 8(rp)
- lea 16(rp), rp
- jmp L(addx)
-
-
- ALIGN(16)
-L(1m4):
-L(lo1): mov (mp,nneg,8), %rax
- xor %r9, %r9
- xor R32(%rbx), R32(%rbx)
- mul q0
- mov %rax, %r9
- mov 8(mp,nneg,8), %rax
- mov 24(up,nneg,8), %r15
- mov %rdx, %r14
- mov $0, R32(%r10) C xor?
- mul q0
- add 16(up,nneg,8), %r9
- adc %rax, %r14
- adc %rdx, %rbx
- mov 16(mp,nneg,8), %rax
- mul q0
- lea 1(nneg), i
- add %r14, %r15
- imul u0inv, %r15
- jmp L(e1)
-
- ALIGN(16)
-L(li1): add %r10, (up,i,8)
- adc %rax, %r9
- mov (mp,i,8), %rax
- adc %rdx, %r14
- xor R32(%r10), R32(%r10)
- mul q0
- add %r9, 8(up,i,8)
- adc %rax, %r14
- adc %rdx, %rbx
- mov 8(mp,i,8), %rax
- mul q0
-L(e1): add %r14, 16(up,i,8)
- adc %rax, %rbx
- adc %rdx, %r10
- mov 16(mp,i,8), %rax
- mul q0
- add %rbx, 24(up,i,8)
- mov $0, R32(%r14) C zero
- mov %r14, %rbx C zero
- adc %rax, %r10
- mov 24(mp,i,8), %rax
- mov %r14, %r9 C zero
- adc %rdx, %r9
- mul q0
- add $4, i
- js L(li1)
-
-L(le1): add %r10, (up)
- adc %rax, %r9
- adc %r14, %rdx
- add %r9, 8(up)
- adc $0, %rdx
- mov %rdx, 16(up,nneg,8) C up[0]
- add $8, up
- mov %r15, q0
- dec n
- jnz L(lo1)
-
- mov nneg, n
- sar $2, n
- lea 24(up,nneg,8), up
- lea (up,nneg,8), vp
-
- mov -8(up), %r8
- add -8(vp), %r8
- mov %r8, (rp)
- lea 8(rp), rp
- jmp L(addx)
-
-
- ALIGN(16)
-L(0m4):
-L(lo0): mov (mp,nneg,8), %rax
- mov nneg, i
- mul q0
- xor R32(%r10), R32(%r10)
- mov %rax, %r14
- mov %rdx, %rbx
- mov 8(mp,nneg,8), %rax
- mov 24(up,nneg,8), %r15
- mul q0
- add 16(up,nneg,8), %r14
- adc %rax, %rbx
- adc %rdx, %r10
- add %rbx, %r15
- imul u0inv, %r15
- jmp L(e0)
-
- ALIGN(16)
-L(li0): add %r10, (up,i,8)
- adc %rax, %r9
- mov (mp,i,8), %rax
- adc %rdx, %r14
- xor R32(%r10), R32(%r10)
- mul q0
- add %r9, 8(up,i,8)
- adc %rax, %r14
- adc %rdx, %rbx
- mov 8(mp,i,8), %rax
- mul q0
- add %r14, 16(up,i,8)
- adc %rax, %rbx
- adc %rdx, %r10
-L(e0): mov 16(mp,i,8), %rax
- mul q0
- add %rbx, 24(up,i,8)
- mov $0, R32(%r14) C zero
- mov %r14, %rbx C zero
- adc %rax, %r10
- mov 24(mp,i,8), %rax
- mov %r14, %r9 C zero
- adc %rdx, %r9
- mul q0
- add $4, i
- js L(li0)
-
-L(le0): add %r10, (up)
- adc %rax, %r9
- adc %r14, %rdx
- add %r9, 8(up)
- adc $0, %rdx
- mov %rdx, 16(up,nneg,8) C up[0]
- add $8, up
- mov %r15, q0
- dec n
- jnz L(lo0)
-
- mov nneg, n
- sar $2, n
- clc
- lea 16(up,nneg,8), up
- lea (up,nneg,8), vp
- jmp L(addy)
-
-
- ALIGN(16)
-L(3m4):
-L(lo3): mov (mp,nneg,8), %rax
- mul q0
- mov %rax, %rbx
- mov %rdx, %r10
- mov 8(mp,nneg,8), %rax
- mov 24(up,nneg,8), %r15
- mul q0
- add 16(up,nneg,8), %rbx C result is zero, might carry
- mov $0, R32(%rbx) C zero
- mov %rbx, %r14 C zero
- adc %rax, %r10
- mov 16(mp,nneg,8), %rax
- mov %r14, %r9 C zero
- adc %rdx, %r9
- add %r10, %r15
- mul q0
- lea 3(nneg), i
- imul u0inv, %r15
-C jmp L(li3)
-
- ALIGN(16)
-L(li3): add %r10, (up,i,8)
- adc %rax, %r9
- mov (mp,i,8), %rax
- adc %rdx, %r14
- xor R32(%r10), R32(%r10)
- mul q0
- add %r9, 8(up,i,8)
- adc %rax, %r14
- adc %rdx, %rbx
- mov 8(mp,i,8), %rax
- mul q0
- add %r14, 16(up,i,8)
- adc %rax, %rbx
- adc %rdx, %r10
- mov 16(mp,i,8), %rax
- mul q0
- add %rbx, 24(up,i,8)
- mov $0, R32(%r14) C zero
- mov %r14, %rbx C zero
- adc %rax, %r10
- mov 24(mp,i,8), %rax
- mov %r14, %r9 C zero
- adc %rdx, %r9
- mul q0
- add $4, i
- js L(li3)
-
-L(le3): add %r10, (up)
- adc %rax, %r9
- adc %r14, %rdx
- add %r9, 8(up)
- adc $0, %rdx
- mov %rdx, 16(up,nneg,8) C up[0]
- mov %r15, q0
- lea 8(up), up
- dec n
- jnz L(lo3)
-
-
-C ==== Addition code ====
- mov nneg, n
- sar $2, n
- lea 40(up,nneg,8), up
- lea (up,nneg,8), vp
-
- mov -24(up), %r8
- mov -16(up), %r9
- mov -8(up), %r10
- add -24(vp), %r8
- adc -16(vp), %r9
- adc -8(vp), %r10
- mov %r8, (rp)
- mov %r9, 8(rp)
- mov %r10, 16(rp)
- lea 24(rp), rp
-
-L(addx):inc n
- jz L(ad3)
-
-L(addy):mov (up), %r8
- mov 8(up), %r9
- inc n
- jmp L(mid)
-
-C ALIGN(16)
-L(al3): adc (vp), %r8
- adc 8(vp), %r9
- adc 16(vp), %r10
- adc 24(vp), %r11
- mov %r8, (rp)
- lea 32(up), up
- mov %r9, 8(rp)
- mov %r10, 16(rp)
- inc n
- mov %r11, 24(rp)
- lea 32(vp), vp
- mov (up), %r8
- mov 8(up), %r9
- lea 32(rp), rp
-L(mid): mov 16(up), %r10
- mov 24(up), %r11
- jnz L(al3)
-
-L(ae3): adc (vp), %r8
- adc 8(vp), %r9
- adc 16(vp), %r10
- adc 24(vp), %r11
- mov %r8, (rp)
- mov %r9, 8(rp)
- mov %r10, 16(rp)
- mov %r11, 24(rp)
-
-L(ad3): mov R32(n), R32(%rax) C zero
- adc R32(%rax), R32(%rax)
-
-L(ret): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- pop %rbp
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/logops_n.asm b/gmp/mpn/x86_64/logops_n.asm
index b277f58962..1022b61376 100644
--- a/gmp/mpn/x86_64/logops_n.asm
+++ b/gmp/mpn/x86_64/logops_n.asm
@@ -1,45 +1,30 @@
dnl AMD64 logops.
-dnl Copyright 2004-2006, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 1.5 with fluctuations for variant 2 and 3
-C AMD K10 1.5 with fluctuations for all variants
-C Intel P4 2.8/3.35/3.60 (variant1/variant2/variant3)
-C Intel core2 2
-C Intel NHM 2
-C Intel SBR 1.5/1.75/1.75
-C Intel atom 3.75
-C VIA nano 3.25
+C K8,K9: 1.5
+C K10: 1.75-2 (fluctuating)
+C P4: 2.8/3.35/3.60 (variant1/variant2/variant3)
+C P6-15: 2.0
ifdef(`OPERATION_and_n',`
define(`func',`mpn_and_n')
@@ -83,8 +68,6 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',`%rcx')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
ASM_START()
@@ -92,16 +75,15 @@ ifdef(`VARIANT_1',`
TEXT
ALIGN(32)
PROLOGUE(func)
- FUNC_ENTRY(4)
movq (vp), %r8
- movl R32(%rcx), R32(%rax)
+ movl %ecx, %eax
leaq (vp,n,8), vp
leaq (up,n,8), up
leaq (rp,n,8), rp
negq n
- andl $3, R32(%rax)
+ andl $3, %eax
je L(b00)
- cmpl $2, R32(%rax)
+ cmpl $2, %eax
jc L(b01)
je L(b10)
@@ -131,8 +113,7 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): FUNC_EXIT()
- ret
+L(ret): ret
EPILOGUE()
')
@@ -140,17 +121,16 @@ ifdef(`VARIANT_2',`
TEXT
ALIGN(32)
PROLOGUE(func)
- FUNC_ENTRY(4)
movq (vp), %r8
notq %r8
- movl R32(%rcx), R32(%rax)
+ movl %ecx, %eax
leaq (vp,n,8), vp
leaq (up,n,8), up
leaq (rp,n,8), rp
negq n
- andl $3, R32(%rax)
+ andl $3, %eax
je L(b00)
- cmpl $2, R32(%rax)
+ cmpl $2, %eax
jc L(b01)
je L(b10)
@@ -184,8 +164,7 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): FUNC_EXIT()
- ret
+L(ret): ret
EPILOGUE()
')
@@ -193,16 +172,15 @@ ifdef(`VARIANT_3',`
TEXT
ALIGN(32)
PROLOGUE(func)
- FUNC_ENTRY(4)
movq (vp), %r8
- movl R32(%rcx), R32(%rax)
+ movl %ecx, %eax
leaq (vp,n,8), vp
leaq (up,n,8), up
leaq (rp,n,8), rp
negq n
- andl $3, R32(%rax)
+ andl $3, %eax
je L(b00)
- cmpl $2, R32(%rax)
+ cmpl $2, %eax
jc L(b01)
je L(b10)
@@ -238,7 +216,6 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): FUNC_EXIT()
- ret
+L(ret): ret
EPILOGUE()
')
diff --git a/gmp/mpn/x86_64/lshift.asm b/gmp/mpn/x86_64/lshift.asm
index f368944b85..11fe59c24e 100644
--- a/gmp/mpn/x86_64/lshift.asm
+++ b/gmp/mpn/x86_64/lshift.asm
@@ -1,45 +1,31 @@
dnl AMD64 mpn_lshift -- mpn left shift.
-dnl Copyright 2003, 2005, 2007, 2009, 2011, 2012 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb cycles/limb cnt=1
-C AMD K8,K9 2.375 1.375
-C AMD K10 2.375 1.375
-C Intel P4 8 10.5
-C Intel core2 2.11 4.28
-C Intel corei ? ?
-C Intel atom 5.75 3.5
-C VIA nano 3.5 2.25
+C K8,K9: 2.375 1.375
+C K10: 2.375 1.375
+C P4: 8 10.5
+C P6-15 (Core2): 2.11 4.28
+C P6-28 (Atom): 5.75 3.5
C INPUT PARAMETERS
@@ -48,19 +34,15 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_lshift)
- FUNC_ENTRY(4)
- cmp $1, R8(%rcx)
+ cmp $1, %cl
jne L(gen)
C For cnt=1 we want to work from lowest limb towards higher limbs.
-C Check for bad overlap (up=rp is OK!) up=rp+1..rp+n-1 is bad.
+C Check for bad overlap (up=rp is OK!) up=1..rp+n-1 is bad.
C FIXME: this could surely be done more cleverly.
mov rp, %rax
@@ -95,30 +77,27 @@ L(t1): mov (up), %r8
dec n
jne L(t1)
- inc R32(%rax)
- dec R32(%rax)
+ inc %eax
+ dec %eax
jne L(n00)
- adc R32(%rax), R32(%rax)
- FUNC_EXIT()
+ adc %eax, %eax
ret
-L(e1): test R32(%rax), R32(%rax) C clear cy
+L(e1): test %eax, %eax C clear cy
L(n00): mov (up), %r8
- dec R32(%rax)
+ dec %eax
jne L(n01)
adc %r8, %r8
mov %r8, (rp)
-L(ret): adc R32(%rax), R32(%rax)
- FUNC_EXIT()
+L(ret): adc %eax, %eax
ret
-L(n01): dec R32(%rax)
+L(n01): dec %eax
mov 8(up), %r9
jne L(n10)
adc %r8, %r8
adc %r9, %r9
mov %r8, (rp)
mov %r9, 8(rp)
- adc R32(%rax), R32(%rax)
- FUNC_EXIT()
+ adc %eax, %eax
ret
L(n10): mov 16(up), %r10
adc %r8, %r8
@@ -127,15 +106,14 @@ L(n10): mov 16(up), %r10
mov %r8, (rp)
mov %r9, 8(rp)
mov %r10, 16(rp)
- adc $-1, R32(%rax)
- FUNC_EXIT()
+ adc $-1, %eax
ret
-L(gen): neg R32(%rcx) C put rsh count in cl
+L(gen): neg %ecx C put rsh count in cl
mov -8(up,n,8), %rax
- shr R8(%rcx), %rax C function return value
+ shr %cl, %rax C function return value
- neg R32(%rcx) C put lsh count in cl
+ neg %ecx C put lsh count in cl
lea 1(n), R32(%r8)
and $3, R32(%r8)
je L(rlx) C jump for n = 3, 7, 11, ...
@@ -144,10 +122,10 @@ L(gen): neg R32(%rcx) C put rsh count in cl
jne L(1)
C n = 4, 8, 12, ...
mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
- neg R32(%rcx) C put rsh count in cl
+ shl %cl, %r10
+ neg %ecx C put rsh count in cl
mov -16(up,n,8), %r8
- shr R8(%rcx), %r8
+ shr %cl, %r8
or %r8, %r10
mov %r10, -8(rp,n,8)
dec n
@@ -157,91 +135,90 @@ L(1): dec R32(%r8)
je L(1x) C jump for n = 1, 5, 9, 13, ...
C n = 2, 6, 10, 16, ...
mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
- neg R32(%rcx) C put rsh count in cl
+ shl %cl, %r10
+ neg %ecx C put rsh count in cl
mov -16(up,n,8), %r8
- shr R8(%rcx), %r8
+ shr %cl, %r8
or %r8, %r10
mov %r10, -8(rp,n,8)
dec n
- neg R32(%rcx) C put lsh count in cl
+ neg %ecx C put lsh count in cl
L(1x):
cmp $1, n
je L(ast)
mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
+ shl %cl, %r10
mov -16(up,n,8), %r11
- shl R8(%rcx), %r11
- neg R32(%rcx) C put rsh count in cl
+ shl %cl, %r11
+ neg %ecx C put rsh count in cl
mov -16(up,n,8), %r8
mov -24(up,n,8), %r9
- shr R8(%rcx), %r8
+ shr %cl, %r8
or %r8, %r10
- shr R8(%rcx), %r9
+ shr %cl, %r9
or %r9, %r11
mov %r10, -8(rp,n,8)
mov %r11, -16(rp,n,8)
sub $2, n
-L(rll): neg R32(%rcx) C put lsh count in cl
+L(rll): neg %ecx C put lsh count in cl
L(rlx): mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
+ shl %cl, %r10
mov -16(up,n,8), %r11
- shl R8(%rcx), %r11
+ shl %cl, %r11
sub $4, n C 4
jb L(end) C 2
ALIGN(16)
L(top):
C finish stuff from lsh block
- neg R32(%rcx) C put rsh count in cl
+ neg %ecx C put rsh count in cl
mov 16(up,n,8), %r8
mov 8(up,n,8), %r9
- shr R8(%rcx), %r8
+ shr %cl, %r8
or %r8, %r10
- shr R8(%rcx), %r9
+ shr %cl, %r9
or %r9, %r11
mov %r10, 24(rp,n,8)
mov %r11, 16(rp,n,8)
C start two new rsh
mov 0(up,n,8), %r8
mov -8(up,n,8), %r9
- shr R8(%rcx), %r8
- shr R8(%rcx), %r9
+ shr %cl, %r8
+ shr %cl, %r9
C finish stuff from rsh block
- neg R32(%rcx) C put lsh count in cl
+ neg %ecx C put lsh count in cl
mov 8(up,n,8), %r10
mov 0(up,n,8), %r11
- shl R8(%rcx), %r10
+ shl %cl, %r10
or %r10, %r8
- shl R8(%rcx), %r11
+ shl %cl, %r11
or %r11, %r9
mov %r8, 8(rp,n,8)
mov %r9, 0(rp,n,8)
C start two new lsh
mov -8(up,n,8), %r10
mov -16(up,n,8), %r11
- shl R8(%rcx), %r10
- shl R8(%rcx), %r11
+ shl %cl, %r10
+ shl %cl, %r11
sub $4, n
jae L(top) C 2
L(end):
- neg R32(%rcx) C put rsh count in cl
- mov 8(up), %r8
- shr R8(%rcx), %r8
+ neg %ecx C put rsh count in cl
+ mov 16(up,n,8), %r8
+ shr %cl, %r8
or %r8, %r10
- mov (up), %r9
- shr R8(%rcx), %r9
+ mov 8(up,n,8), %r9
+ shr %cl, %r9
or %r9, %r11
- mov %r10, 16(rp)
- mov %r11, 8(rp)
+ mov %r10, 24(rp,n,8)
+ mov %r11, 16(rp,n,8)
- neg R32(%rcx) C put lsh count in cl
+ neg %ecx C put lsh count in cl
L(ast): mov (up), %r10
- shl R8(%rcx), %r10
+ shl %cl, %r10
mov %r10, (rp)
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/lshiftc.asm b/gmp/mpn/x86_64/lshiftc.asm
deleted file mode 100644
index c4ba04a173..0000000000
--- a/gmp/mpn/x86_64/lshiftc.asm
+++ /dev/null
@@ -1,182 +0,0 @@
-dnl AMD64 mpn_lshiftc -- mpn left shift with complement.
-
-dnl Copyright 2003, 2005, 2006, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 2.75
-C AMD K10 2.75
-C Intel P4 ?
-C Intel core2 ?
-C Intel corei ?
-C Intel atom ?
-C VIA nano 3.75
-
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n', `%rdx')
-define(`cnt', `%rcx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_lshiftc)
- FUNC_ENTRY(4)
- neg R32(%rcx) C put rsh count in cl
- mov -8(up,n,8), %rax
- shr R8(%rcx), %rax C function return value
-
- neg R32(%rcx) C put lsh count in cl
- lea 1(n), R32(%r8)
- and $3, R32(%r8)
- je L(rlx) C jump for n = 3, 7, 11, ...
-
- dec R32(%r8)
- jne L(1)
-C n = 4, 8, 12, ...
- mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
- neg R32(%rcx) C put rsh count in cl
- mov -16(up,n,8), %r8
- shr R8(%rcx), %r8
- or %r8, %r10
- not %r10
- mov %r10, -8(rp,n,8)
- dec n
- jmp L(rll)
-
-L(1): dec R32(%r8)
- je L(1x) C jump for n = 1, 5, 9, 13, ...
-C n = 2, 6, 10, 16, ...
- mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
- neg R32(%rcx) C put rsh count in cl
- mov -16(up,n,8), %r8
- shr R8(%rcx), %r8
- or %r8, %r10
- not %r10
- mov %r10, -8(rp,n,8)
- dec n
- neg R32(%rcx) C put lsh count in cl
-L(1x):
- cmp $1, n
- je L(ast)
- mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
- mov -16(up,n,8), %r11
- shl R8(%rcx), %r11
- neg R32(%rcx) C put rsh count in cl
- mov -16(up,n,8), %r8
- mov -24(up,n,8), %r9
- shr R8(%rcx), %r8
- or %r8, %r10
- shr R8(%rcx), %r9
- or %r9, %r11
- not %r10
- not %r11
- mov %r10, -8(rp,n,8)
- mov %r11, -16(rp,n,8)
- sub $2, n
-
-L(rll): neg R32(%rcx) C put lsh count in cl
-L(rlx): mov -8(up,n,8), %r10
- shl R8(%rcx), %r10
- mov -16(up,n,8), %r11
- shl R8(%rcx), %r11
-
- sub $4, n C 4
- jb L(end) C 2
- ALIGN(16)
-L(top):
- C finish stuff from lsh block
- neg R32(%rcx) C put rsh count in cl
- mov 16(up,n,8), %r8
- mov 8(up,n,8), %r9
- shr R8(%rcx), %r8
- or %r8, %r10
- shr R8(%rcx), %r9
- or %r9, %r11
- not %r10
- not %r11
- mov %r10, 24(rp,n,8)
- mov %r11, 16(rp,n,8)
- C start two new rsh
- mov 0(up,n,8), %r8
- mov -8(up,n,8), %r9
- shr R8(%rcx), %r8
- shr R8(%rcx), %r9
-
- C finish stuff from rsh block
- neg R32(%rcx) C put lsh count in cl
- mov 8(up,n,8), %r10
- mov 0(up,n,8), %r11
- shl R8(%rcx), %r10
- or %r10, %r8
- shl R8(%rcx), %r11
- or %r11, %r9
- not %r8
- not %r9
- mov %r8, 8(rp,n,8)
- mov %r9, 0(rp,n,8)
- C start two new lsh
- mov -8(up,n,8), %r10
- mov -16(up,n,8), %r11
- shl R8(%rcx), %r10
- shl R8(%rcx), %r11
-
- sub $4, n
- jae L(top) C 2
-L(end):
- neg R32(%rcx) C put rsh count in cl
- mov 8(up), %r8
- shr R8(%rcx), %r8
- or %r8, %r10
- mov (up), %r9
- shr R8(%rcx), %r9
- or %r9, %r11
- not %r10
- not %r11
- mov %r10, 16(rp)
- mov %r11, 8(rp)
-
- neg R32(%rcx) C put lsh count in cl
-L(ast): mov (up), %r10
- shl R8(%rcx), %r10
- not %r10
- mov %r10, (rp)
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/lshsub_n.asm b/gmp/mpn/x86_64/lshsub_n.asm
index 4d428c0bd2..dc8576b220 100644
--- a/gmp/mpn/x86_64/lshsub_n.asm
+++ b/gmp/mpn/x86_64/lshsub_n.asm
@@ -1,44 +1,30 @@
dnl AMD64 mpn_lshsub_n. R = 2^k(U - V).
-dnl Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2006 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
-C AMD K10 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
-C Intel P4 16.5
-C Intel core2 4.35
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
+C K10: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
+C P4: 16.5
+C P6-15: 4.35
C This was written quickly and not optimized at all, but it runs very well on
C K8. But perhaps one could get under 3 c/l. Ideas:
@@ -53,17 +39,12 @@ define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n', `%rcx')
-define(`cnt', `%r8')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`cnt' `%r8')
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshsub_n)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
push %r12
push %r13
@@ -72,32 +53,32 @@ IFDOS(` mov 56(%rsp), %r8d ')
push %rbx
mov n, %rax
- xor R32(%rbx), R32(%rbx) C clear carry save register
- mov R32(%r8), R32(%rcx) C shift count
- xor R32(%r15), R32(%r15) C limb carry
+ xor %ebx, %ebx C clear carry save register
+ mov %r8d, %ecx C shift count
+ xor %r15d, %r15d C limb carry
- mov R32(%rax), R32(%r11)
- and $3, R32(%r11)
+ mov %eax, %r11d
+ and $3, %r11d
je L(4)
- sub $1, R32(%r11)
+ sub $1, %r11d
L(oopette):
- add R32(%rbx), R32(%rbx) C restore carry flag
+ add %ebx, %ebx C restore carry flag
mov 0(up), %r8
lea 8(up), up
sbb 0(vp), %r8
mov %r8, %r12
- sbb R32(%rbx), R32(%rbx) C save carry flag
- shl R8(%rcx), %r8
+ sbb %ebx, %ebx C save carry flag
+ shl %cl, %r8
or %r15, %r8
mov %r12, %r15
lea 8(vp), vp
- neg R8(%rcx)
- shr R8(%rcx), %r15
- neg R8(%rcx)
+ neg %cl
+ shr %cl, %r15
+ neg %cl
mov %r8, 0(rp)
lea 8(rp), rp
- sub $1, R32(%r11)
+ sub $1, %r11d
jnc L(oopette)
L(4):
@@ -106,7 +87,7 @@ L(4):
ALIGN(16)
L(oop):
- add R32(%rbx), R32(%rbx) C restore carry flag
+ add %ebx, %ebx C restore carry flag
mov 0(up), %r8
mov 8(up), %r9
@@ -123,29 +104,29 @@ L(oop):
mov %r10, %r14
sbb 24(vp), %r11
- sbb R32(%rbx), R32(%rbx) C save carry flag
+ sbb %ebx, %ebx C save carry flag
- shl R8(%rcx), %r8
- shl R8(%rcx), %r9
- shl R8(%rcx), %r10
+ shl %cl, %r8
+ shl %cl, %r9
+ shl %cl, %r10
or %r15, %r8
mov %r11, %r15
- shl R8(%rcx), %r11
+ shl %cl, %r11
lea 32(vp), vp
- neg R8(%rcx)
+ neg %cl
- shr R8(%rcx), %r12
- shr R8(%rcx), %r13
- shr R8(%rcx), %r14
- shr R8(%rcx), %r15 C used next loop
+ shr %cl, %r12
+ shr %cl, %r13
+ shr %cl, %r14
+ shr %cl, %r15 C used next loop
or %r12, %r9
or %r13, %r10
or %r14, %r11
- neg R8(%rcx)
+ neg %cl
mov %r8, 0(rp)
mov %r9, 8(rp)
@@ -157,8 +138,8 @@ L(oop):
sub $4, %rax
jnc L(oop)
L(end):
- neg R32(%rbx)
- shl R8(%rcx), %rbx
+ neg %ebx
+ shl %cl, %rbx
adc %r15, %rbx
mov %rbx, %rax
pop %rbx
@@ -167,6 +148,5 @@ L(end):
pop %r13
pop %r12
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/missing-call.m4 b/gmp/mpn/x86_64/missing-call.m4
deleted file mode 100644
index c024f0ed77..0000000000
--- a/gmp/mpn/x86_64/missing-call.m4
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl AMD64 MULX/ADX simulation support, function call version.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-define(`adox',`
- push $1
- push $2
- call __gmp_adox
- pop $2
-')
-
-define(`adcx',`
- push $1
- push $2
- call __gmp_adcx
- pop $2
-')
-
-define(`mulx',`
- push $1
- call __gmp_mulx
- pop $2
- pop $3
-')
diff --git a/gmp/mpn/x86_64/missing-inline.m4 b/gmp/mpn/x86_64/missing-inline.m4
deleted file mode 100644
index bd1df1313f..0000000000
--- a/gmp/mpn/x86_64/missing-inline.m4
+++ /dev/null
@@ -1,100 +0,0 @@
-dnl AMD64 MULX/ADX simulation support, inline version.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-define(`adox',`
- push $2
- push %rcx
- push %rbx
- push %rax
- mov $1, %rcx
- pushfq
- pushfq
-C copy 0(%rsp):11 to 0(%rsp):0
- mov (%rsp), %rbx
- shr %rbx
- bt $`'10, %rbx
- adc %rbx, %rbx
- mov %rbx, (%rsp)
-C put manipulated flags into eflags, execute a plain adc
- popfq
- adc %rcx, 32(%rsp)
-C copy CF to 0(%rsp):11
- mov (%rsp), %rbx
- sbb R32(%rax), R32(%rax)
- and $`'0x800, R32(%rax)
- and $`'0xfffffffffffff7ff, %rbx
- or %rax, %rbx
- mov %rbx, (%rsp)
-C put manipulated flags into eflags
- popfq
- pop %rax
- pop %rbx
- pop %rcx
- pop $2
-')
-
-define(`adcx',`
- push $2
- push %rcx
- push %rbx
- push %rax
- mov $1, %rcx
- pushfq
- adc %rcx, 32(%rsp)
- mov (%rsp), %rbx
- sbb R32(%rax), R32(%rax)
- and $`'0xfffffffffffffffe, %rbx
- sub %rax, %rbx
- mov %rbx, (%rsp)
- popfq
- pop %rax
- pop %rbx
- pop %rcx
- pop $2
-')
-
-define(`mulx',`
- lea -16(%rsp), %rsp
- push %rax
- push %rdx
- pushfq C preserve all flags
- mov $1, %rax
- mul %rdx
- mov %rax, 24(%rsp)
- mov %rdx, 32(%rsp)
- popfq C restore eflags
- pop %rdx
- pop %rax
- pop $2
- pop $3
-')
diff --git a/gmp/mpn/x86_64/missing.asm b/gmp/mpn/x86_64/missing.asm
deleted file mode 100644
index 9b65c89dd4..0000000000
--- a/gmp/mpn/x86_64/missing.asm
+++ /dev/null
@@ -1,130 +0,0 @@
-
- dnl AMD64 MULX/ADX simulation support.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-ASM_START()
-
-C Fake the MULX instruction
-C
-C Accept the single explicit parameter on the stack, return the two result
-C words on the stack. This calling convention means that we need to move the
-C return address up.
-C
-PROLOGUE(__gmp_mulx)
- lea -8(%rsp), %rsp
- push %rax
- push %rdx
- pushfq C preserve all flags
- mov 32(%rsp), %rax C move retaddr...
- mov %rax, 24(%rsp) C ...up the stack
- mov 40(%rsp), %rax C input parameter
- mul %rdx
- mov %rax, 32(%rsp)
- mov %rdx, 40(%rsp)
- popfq C restore eflags
- pop %rdx
- pop %rax
- ret
-EPILOGUE()
-PROTECT(__gmp_mulx)
-
-
-C Fake the ADOX instruction
-C
-C Accept the two parameters on the stack, return the result word on the stack.
-C This calling convention means that we need to move the return address down.
-C
-PROLOGUE(__gmp_adox)
- push %rcx
- push %rbx
- push %rax
- mov 32(%rsp), %rcx C src2
- mov 24(%rsp), %rax C move retaddr...
- mov %rax, 32(%rsp) C ...down the stack
- pushfq
-C copy 0(%rsp):11 to 0(%rsp):0
- mov (%rsp), %rbx
- shr %rbx
- bt $10, %rbx
- adc %rbx, %rbx
- push %rbx
-C put manipulated flags into eflags, execute a plain adc
- popfq
- adc %rcx, 48(%rsp)
-C copy CF to 0(%rsp):11
- pop %rbx
- sbb R32(%rax), R32(%rax)
- and $0x800, R32(%rax)
- and $0xfffffffffffff7ff, %rbx
- or %rax, %rbx
- push %rbx
-C put manipulated flags into eflags
- popfq
- pop %rax
- pop %rbx
- pop %rcx
- lea 8(%rsp), %rsp
- ret
-EPILOGUE()
-PROTECT(__gmp_adox)
-
-
-C Fake the ADCX instruction
-C
-C Accept the two parameters on the stack, return the result word on the stack.
-C This calling convention means that we need to move the return address down.
-C
-PROLOGUE(__gmp_adcx)
- push %rcx
- push %rbx
- push %rax
- mov 32(%rsp), %rcx C src2
- mov 24(%rsp), %rax C move retaddr...
- mov %rax, 32(%rsp) C ...down the stack
- pushfq
- adc %rcx, 48(%rsp)
- pop %rbx
- sbb R32(%rax), R32(%rax)
- and $`'0xfffffffffffffffe, %rbx
- sub %rax, %rbx
- push %rbx
- popfq
- pop %rax
- pop %rbx
- pop %rcx
- lea 8(%rsp), %rsp
- ret
-EPILOGUE()
-PROTECT(__gmp_adcx)
diff --git a/gmp/mpn/x86_64/mod_1_1.asm b/gmp/mpn/x86_64/mod_1_1.asm
deleted file mode 100644
index 4a7c45a58b..0000000000
--- a/gmp/mpn/x86_64/mod_1_1.asm
+++ /dev/null
@@ -1,235 +0,0 @@
-dnl AMD64 mpn_mod_1_1p
-
-dnl Contributed to the GNU project by Torbjörn Granlund and Niels Möller.
-
-dnl Copyright 2009-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 6
-C AMD K10 6
-C Intel P4 26
-C Intel core2 12.5
-C Intel NHM 11.3
-C Intel SBR 8.4 (slowdown, old code took 8.0)
-C Intel atom 26
-C VIA nano 13
-
-define(`B2mb', `%r10')
-define(`B2modb', `%r11')
-define(`ap', `%rdi')
-define(`n', `%rsi')
-define(`pre', `%r8')
-define(`b', `%rbx')
-
-define(`r0', `%rbp') C r1 kept in %rax
-define(`r2', `%rcx') C kept negated. Also used as shift count
-define(`t0', `%r9')
-
-C mp_limb_t
-C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
-C %rdi %rsi %rdx %rcx
-C The pre array contains bi, cnt, B1modb, B2modb
-C Note: This implementation needs B1modb only when cnt > 0
-
-C The iteration is almost as follows,
-C
-C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u
-C
-C where r2 is a single bit represented as a mask. But to make sure that the
-C result fits in two limbs and a bit, carry from the addition
-C
-C r_0 + r_2 B2mod
-C
-C is handled specially. On carry, we subtract b to cancel the carry,
-C and we use instead the value
-C
-C r_0 + B2mb (mod B)
-C
-C This addition can be issued early since it doesn't depend on r2, and it is
-C the source of the cmov in the loop.
-C
-C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1_1p)
- FUNC_ENTRY(4)
- push %rbp
- push %rbx
- mov %rdx, b
- mov %rcx, pre
-
- mov -8(ap, n, 8), %rax
- cmp $3, n
- jnc L(first)
- mov -16(ap, n, 8), r0
- jmp L(reduce_two)
-
-L(first):
- C First iteration, no r2
- mov 24(pre), B2modb
- mul B2modb
- mov -24(ap, n, 8), r0
- add %rax, r0
- mov -16(ap, n, 8), %rax
- adc %rdx, %rax
- sbb r2, r2
- sub $4, n
- jc L(reduce_three)
-
- mov B2modb, B2mb
- sub b, B2mb
-
- ALIGN(16)
-L(top): and B2modb, r2
- lea (B2mb, r0), t0
- mul B2modb
- add r0, r2
- mov (ap, n, 8), r0
- cmovc t0, r2
- add %rax, r0
- mov r2, %rax
- adc %rdx, %rax
- sbb r2, r2
- sub $1, n
- jnc L(top)
-
-L(reduce_three):
- C Eliminate r2
- and b, r2
- sub r2, %rax
-
-L(reduce_two):
- mov 8(pre), R32(%rcx)
- test R32(%rcx), R32(%rcx)
- jz L(normalized)
-
- C Unnormalized, use B1modb to reduce to size < B (b+1)
- mulq 16(pre)
- xor t0, t0
- add %rax, r0
- adc %rdx, t0
- mov t0, %rax
-
- C Left-shift to normalize
-ifdef(`SHLD_SLOW',`
- shl R8(%rcx), %rax
- mov r0, t0
- neg R32(%rcx)
- shr R8(%rcx), t0
- or t0, %rax
- neg R32(%rcx)
-',`
- shld R8(%rcx), r0, %rax
-')
- shl R8(%rcx), r0
- jmp L(udiv)
-
-L(normalized):
- mov %rax, t0
- sub b, t0
- cmovnc t0, %rax
-
-L(udiv):
- lea 1(%rax), t0
- mulq (pre)
- add r0, %rax
- adc t0, %rdx
- imul b, %rdx
- sub %rdx, r0
- cmp r0, %rax
- lea (b, r0), %rax
- cmovnc r0, %rax
- cmp b, %rax
- jnc L(fix)
-L(ok): shr R8(%rcx), %rax
-
- pop %rbx
- pop %rbp
- FUNC_EXIT()
- ret
-L(fix): sub b, %rax
- jmp L(ok)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1_1p_cps)
- FUNC_ENTRY(2)
- push %rbp
- bsr %rsi, %rcx
- push %rbx
- mov %rdi, %rbx
- push %r12
- xor $63, R32(%rcx)
- mov %rsi, %r12
- mov R32(%rcx), R32(%rbp)
- sal R8(%rcx), %r12
-IFSTD(` mov %r12, %rdi ') C pass parameter
-IFDOS(` mov %r12, %rcx ') C pass parameter
- CALL( mpn_invert_limb)
- neg %r12
- mov %r12, %r8
- mov %rax, (%rbx) C store bi
- mov %rbp, 8(%rbx) C store cnt
- imul %rax, %r12
- mov %r12, 24(%rbx) C store B2modb
- mov R32(%rbp), R32(%rcx)
- test R32(%rcx), R32(%rcx)
- jz L(z)
-
- mov $1, R32(%rdx)
-ifdef(`SHLD_SLOW',`
- C Destroys %rax, unlike shld. Otherwise, we could do B1modb
- C before B2modb, and get rid of the move %r12, %r8 above.
-
- shl R8(%rcx), %rdx
- neg R32(%rcx)
- shr R8(%rcx), %rax
- or %rax, %rdx
- neg R32(%rcx)
-',`
- shld R8(%rcx), %rax, %rdx
-')
- imul %rdx, %r8
- shr R8(%rcx), %r8
- mov %r8, 16(%rbx) C store B1modb
-L(z):
- pop %r12
- pop %rbx
- pop %rbp
- FUNC_EXIT()
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/mod_1_2.asm b/gmp/mpn/x86_64/mod_1_2.asm
deleted file mode 100644
index 02dd917791..0000000000
--- a/gmp/mpn/x86_64/mod_1_2.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl AMD64 mpn_mod_1s_2p
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 4
-C AMD K10 4
-C Intel P4 19
-C Intel core2 8
-C Intel NHM 6.5
-C Intel SBR 4.5
-C Intel atom 28
-C VIA nano 8
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_2p)
- FUNC_ENTRY(4)
- push %r14
- test $1, R8(%rsi)
- mov %rdx, %r14
- push %r13
- mov %rcx, %r13
- push %r12
- push %rbp
- push %rbx
- mov 16(%rcx), %r10
- mov 24(%rcx), %rbx
- mov 32(%rcx), %rbp
- je L(b0)
- dec %rsi
- je L(one)
- mov -8(%rdi,%rsi,8), %rax
- mul %r10
- mov %rax, %r9
- mov %rdx, %r8
- mov (%rdi,%rsi,8), %rax
- add -16(%rdi,%rsi,8), %r9
- adc $0, %r8
- mul %rbx
- add %rax, %r9
- adc %rdx, %r8
- jmp L(11)
-
-L(b0): mov -8(%rdi,%rsi,8), %r8
- mov -16(%rdi,%rsi,8), %r9
-
-L(11): sub $4, %rsi
- jb L(ed2)
- lea 40(%rdi,%rsi,8), %rdi
- mov -40(%rdi), %r11
- mov -32(%rdi), %rax
- jmp L(m0)
-
- ALIGN(16)
-L(top): mov -24(%rdi), %r9
- add %rax, %r11
- mov -16(%rdi), %rax
- adc %rdx, %r12
- mul %r10
- add %rax, %r9
- mov %r11, %rax
- mov %rdx, %r8
- adc $0, %r8
- mul %rbx
- add %rax, %r9
- mov %r12, %rax
- adc %rdx, %r8
- mul %rbp
- sub $2, %rsi
- jb L(ed1)
- mov -40(%rdi), %r11
- add %rax, %r9
- mov -32(%rdi), %rax
- adc %rdx, %r8
-L(m0): mul %r10
- add %rax, %r11
- mov %r9, %rax
- mov %rdx, %r12
- adc $0, %r12
- mul %rbx
- add %rax, %r11
- lea -32(%rdi), %rdi C ap -= 4
- mov %r8, %rax
- adc %rdx, %r12
- mul %rbp
- sub $2, %rsi
- jae L(top)
-
-L(ed0): mov %r11, %r9
- mov %r12, %r8
-L(ed1): add %rax, %r9
- adc %rdx, %r8
-L(ed2): mov 8(%r13), R32(%rdi) C cnt
- mov %r8, %rax
- mov %r9, %r8
- mul %r10
- add %rax, %r8
- adc $0, %rdx
-L(1): xor R32(%rcx), R32(%rcx)
- mov %r8, %r9
- sub R32(%rdi), R32(%rcx)
- shr R8(%rcx), %r9
- mov R32(%rdi), R32(%rcx)
- sal R8(%rcx), %rdx
- or %rdx, %r9
- sal R8(%rcx), %r8
- mov %r9, %rax
- mulq (%r13)
- mov %rax, %rsi
- inc %r9
- add %r8, %rsi
- adc %r9, %rdx
- imul %r14, %rdx
- sub %rdx, %r8
- lea (%r8,%r14), %rax
- cmp %r8, %rsi
- cmovc %rax, %r8
- mov %r8, %rax
- sub %r14, %rax
- cmovc %r8, %rax
- mov R32(%rdi), R32(%rcx)
- shr R8(%rcx), %rax
- pop %rbx
- pop %rbp
- pop %r12
- pop %r13
- pop %r14
- FUNC_EXIT()
- ret
-L(one):
- mov (%rdi), %r8
- mov 8(%rcx), R32(%rdi)
- xor %rdx, %rdx
- jmp L(1)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_2p_cps)
- FUNC_ENTRY(2)
- push %rbp
- bsr %rsi, %rcx
- push %rbx
- mov %rdi, %rbx
- push %r12
- xor $63, R32(%rcx)
- mov %rsi, %r12
- mov R32(%rcx), R32(%rbp) C preserve cnt over call
- sal R8(%rcx), %r12 C b << cnt
-IFSTD(` mov %r12, %rdi ') C pass parameter
-IFDOS(` mov %r12, %rcx ') C pass parameter
- CALL( mpn_invert_limb)
- mov %r12, %r8
- mov %rax, %r11
- mov %rax, (%rbx) C store bi
- mov %rbp, 8(%rbx) C store cnt
- neg %r8
- mov R32(%rbp), R32(%rcx)
- mov $1, R32(%rsi)
-ifdef(`SHLD_SLOW',`
- shl R8(%rcx), %rsi
- neg R32(%rcx)
- mov %rax, %rbp
- shr R8(%rcx), %rax
- or %rax, %rsi
- mov %rbp, %rax
- neg R32(%rcx)
-',`
- shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano
-')
- imul %r8, %rsi
- mul %rsi
-
- add %rsi, %rdx
- shr R8(%rcx), %rsi
- mov %rsi, 16(%rbx) C store B1modb
-
- not %rdx
- imul %r12, %rdx
- lea (%rdx,%r12), %rsi
- cmp %rdx, %rax
- cmovnc %rdx, %rsi
- mov %r11, %rax
- mul %rsi
-
- add %rsi, %rdx
- shr R8(%rcx), %rsi
- mov %rsi, 24(%rbx) C store B2modb
-
- not %rdx
- imul %r12, %rdx
- add %rdx, %r12
- cmp %rdx, %rax
- cmovnc %rdx, %r12
-
- shr R8(%rcx), %r12
- mov %r12, 32(%rbx) C store B3modb
-
- pop %r12
- pop %rbx
- pop %rbp
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/mod_1_4.asm b/gmp/mpn/x86_64/mod_1_4.asm
deleted file mode 100644
index 3ce83dc42e..0000000000
--- a/gmp/mpn/x86_64/mod_1_4.asm
+++ /dev/null
@@ -1,269 +0,0 @@
-dnl AMD64 mpn_mod_1s_4p
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2009-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 3
-C AMD K10 3
-C Intel P4 15.5
-C Intel core2 5
-C Intel corei 4
-C Intel atom 23
-C VIA nano 4.75
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p)
- FUNC_ENTRY(4)
- push %r15
- push %r14
- push %r13
- push %r12
- push %rbp
- push %rbx
-
- mov %rdx, %r15
- mov %rcx, %r14
- mov 16(%rcx), %r11 C B1modb
- mov 24(%rcx), %rbx C B2modb
- mov 32(%rcx), %rbp C B3modb
- mov 40(%rcx), %r13 C B4modb
- mov 48(%rcx), %r12 C B5modb
- xor R32(%r8), R32(%r8)
- mov R32(%rsi), R32(%rdx)
- and $3, R32(%rdx)
- je L(b0)
- cmp $2, R32(%rdx)
- jc L(b1)
- je L(b2)
-
-L(b3): lea -24(%rdi,%rsi,8), %rdi
- mov 8(%rdi), %rax
- mul %r11
- mov (%rdi), %r9
- add %rax, %r9
- adc %rdx, %r8
- mov 16(%rdi), %rax
- mul %rbx
- jmp L(m0)
-
- ALIGN(8)
-L(b0): lea -32(%rdi,%rsi,8), %rdi
- mov 8(%rdi), %rax
- mul %r11
- mov (%rdi), %r9
- add %rax, %r9
- adc %rdx, %r8
- mov 16(%rdi), %rax
- mul %rbx
- add %rax, %r9
- adc %rdx, %r8
- mov 24(%rdi), %rax
- mul %rbp
- jmp L(m0)
-
- ALIGN(8)
-L(b1): lea -8(%rdi,%rsi,8), %rdi
- mov (%rdi), %r9
- jmp L(m1)
-
- ALIGN(8)
-L(b2): lea -16(%rdi,%rsi,8), %rdi
- mov 8(%rdi), %r8
- mov (%rdi), %r9
- jmp L(m1)
-
- ALIGN(16)
-L(top): mov -24(%rdi), %rax
- mov -32(%rdi), %r10
- mul %r11 C up[1] * B1modb
- add %rax, %r10
- mov -16(%rdi), %rax
- mov $0, R32(%rcx)
- adc %rdx, %rcx
- mul %rbx C up[2] * B2modb
- add %rax, %r10
- mov -8(%rdi), %rax
- adc %rdx, %rcx
- sub $32, %rdi
- mul %rbp C up[3] * B3modb
- add %rax, %r10
- mov %r13, %rax
- adc %rdx, %rcx
- mul %r9 C rl * B4modb
- add %rax, %r10
- mov %r12, %rax
- adc %rdx, %rcx
- mul %r8 C rh * B5modb
- mov %r10, %r9
- mov %rcx, %r8
-L(m0): add %rax, %r9
- adc %rdx, %r8
-L(m1): sub $4, %rsi
- ja L(top)
-
-L(end): mov 8(%r14), R32(%rsi)
- mov %r8, %rax
- mul %r11
- mov %rax, %r8
- add %r9, %r8
- adc $0, %rdx
- xor R32(%rcx), R32(%rcx)
- sub R32(%rsi), R32(%rcx)
- mov %r8, %rdi
- shr R8(%rcx), %rdi
- mov R32(%rsi), R32(%rcx)
- sal R8(%rcx), %rdx
- or %rdx, %rdi
- mov %rdi, %rax
- mulq (%r14)
- mov %r15, %rbx
- mov %rax, %r9
- sal R8(%rcx), %r8
- inc %rdi
- add %r8, %r9
- adc %rdi, %rdx
- imul %rbx, %rdx
- sub %rdx, %r8
- lea (%r8,%rbx), %rax
- cmp %r8, %r9
- cmovc %rax, %r8
- mov %r8, %rax
- sub %rbx, %rax
- cmovc %r8, %rax
- shr R8(%rcx), %rax
- pop %rbx
- pop %rbp
- pop %r12
- pop %r13
- pop %r14
- pop %r15
- FUNC_EXIT()
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1s_4p_cps)
- FUNC_ENTRY(2)
- push %rbp
- bsr %rsi, %rcx
- push %rbx
- mov %rdi, %rbx
- push %r12
- xor $63, R32(%rcx)
- mov %rsi, %r12
- mov R32(%rcx), R32(%rbp) C preserve cnt over call
- sal R8(%rcx), %r12 C b << cnt
-IFSTD(` mov %r12, %rdi ') C pass parameter
-IFDOS(` mov %r12, %rcx ') C pass parameter
- CALL( mpn_invert_limb)
- mov %r12, %r8
- mov %rax, %r11
- mov %rax, (%rbx) C store bi
- mov %rbp, 8(%rbx) C store cnt
- neg %r8
- mov R32(%rbp), R32(%rcx)
- mov $1, R32(%rsi)
-ifdef(`SHLD_SLOW',`
- shl R8(%rcx), %rsi
- neg R32(%rcx)
- mov %rax, %rbp
- shr R8(%rcx), %rax
- or %rax, %rsi
- mov %rbp, %rax
- neg R32(%rcx)
-',`
- shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano
-')
- imul %r8, %rsi
- mul %rsi
-
- add %rsi, %rdx
- shr R8(%rcx), %rsi
- mov %rsi, 16(%rbx) C store B1modb
-
- not %rdx
- imul %r12, %rdx
- lea (%rdx,%r12), %rsi
- cmp %rdx, %rax
- cmovnc %rdx, %rsi
- mov %r11, %rax
- mul %rsi
-
- add %rsi, %rdx
- shr R8(%rcx), %rsi
- mov %rsi, 24(%rbx) C store B2modb
-
- not %rdx
- imul %r12, %rdx
- lea (%rdx,%r12), %rsi
- cmp %rdx, %rax
- cmovnc %rdx, %rsi
- mov %r11, %rax
- mul %rsi
-
- add %rsi, %rdx
- shr R8(%rcx), %rsi
- mov %rsi, 32(%rbx) C store B3modb
-
- not %rdx
- imul %r12, %rdx
- lea (%rdx,%r12), %rsi
- cmp %rdx, %rax
- cmovnc %rdx, %rsi
- mov %r11, %rax
- mul %rsi
-
- add %rsi, %rdx
- shr R8(%rcx), %rsi
- mov %rsi, 40(%rbx) C store B4modb
-
- not %rdx
- imul %r12, %rdx
- add %rdx, %r12
- cmp %rdx, %rax
- cmovnc %rdx, %r12
-
- shr R8(%rcx), %r12
- mov %r12, 48(%rbx) C store B5modb
-
- pop %r12
- pop %rbx
- pop %rbp
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/mod_34lsub1.asm b/gmp/mpn/x86_64/mod_34lsub1.asm
index 62bdcfac69..34df5bb5b7 100644
--- a/gmp/mpn/x86_64/mod_34lsub1.asm
+++ b/gmp/mpn/x86_64/mod_34lsub1.asm
@@ -1,194 +1,155 @@
dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
-dnl Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
+dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way
-C AMD K10 0.67 this seems hard to beat
-C AMD bd1 1
-C AMD bobcat 1.07
-C Intel P4 7.35 terrible, use old code
-C Intel core2 1.25 1+epsilon with huge unrolling
-C Intel NHM 1.15 this seems hard to beat
-C Intel SBR 0.93
-C Intel atom 2.5
-C VIA nano 1.25 this seems hard to beat
+C cycles/limb
+C K8,K9: 1.0
+C K10: 1.12
+C P4: 3.25
+C P6-15 (Core2): 1.5
+C P6-28 (Atom): 2.5
+
C INPUT PARAMETERS
-define(`ap', %rdi)
-define(`n', %rsi)
+C up rdi
+C n rsi
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C TODO
-C * Review feed-in and wind-down code.
+C * Apply the movzwl tricks to the x86/k7 code
+C * Review feed-in and wind-down code. In particular, try to avoid adcq and
+C sbbq to placate Pentium4.
+C * More unrolling and/or index addressing could bring time to under 1 c/l
+C for Athlon64, approaching 0.67 c/l seems possible.
+C * There are recurrences on the carry registers (r8, r9, r10) that might
+C be the limiting factor for the Pentium4 speed. Splitting these into 6
+C registers would help.
+C * For ultimate Athlon64 performance, a sequence like this might be best.
+C It should reach 0.5 c/l (limited by L1 cache bandwidth).
+C
+C addq (%rdi), %rax
+C adcq 8(%rdi), %rcx
+C adcq 16(%rdi), %rdx
+C adcq $0, %r8
+C addq 24(%rdi), %rax
+C adcq 32(%rdi), %rcx
+C adcq 40(%rdi), %rdx
+C adcq $0, %r8
+C ...
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
- FUNC_ENTRY(2)
mov $0x0000FFFFFFFFFFFF, %r11
- mov (ap), %rax
-
- cmp $2, %rsi
+ sub $2, %rsi
ja L(gt2)
- jb L(one)
+ mov (%rdi), %rax
+ nop
+ jb L(1)
- mov 8(ap), %rsi
+ mov 8(%rdi), %rsi
mov %rax, %rdx
shr $48, %rax C src[0] low
and %r11, %rdx C src[0] high
add %rdx, %rax
- mov R32(%rsi), R32(%rdx)
+ mov %esi, %edx
shr $32, %rsi C src[1] high
add %rsi, %rax
shl $16, %rdx C src[1] low
add %rdx, %rax
-L(one): FUNC_EXIT()
- ret
+L(1): ret
-C Don't change this, the wind-down code is not able to handle greater values
-define(UNROLL,3)
-L(gt2): mov 8(ap), %rcx
- mov 16(ap), %rdx
- xor %r9, %r9
- add $24, ap
- sub $eval(UNROLL*3+3), %rsi
- jc L(end)
ALIGN(16)
-L(top):
- add (ap), %rax
- adc 8(ap), %rcx
- adc 16(ap), %rdx
- adc $0, %r9
-forloop(i,1,UNROLL-1,`dnl
- add eval(i*24)(ap), %rax
- adc eval(i*24+8)(ap), %rcx
- adc eval(i*24+16)(ap), %rdx
- adc $0, %r9
-')dnl
- add $eval(UNROLL*24), ap
- sub $eval(UNROLL*3), %rsi
- jnc L(top)
-
-L(end):
- lea L(tab)(%rip), %r8
-ifdef(`PIC',
-` movslq 36(%r8,%rsi,4), %r10
- add %r10, %r8
- jmp *%r8
-',`
- jmp *72(%r8,%rsi,8)
-')
- JUMPTABSECT
- ALIGN(8)
-L(tab): JMPENT( L(0), L(tab))
- JMPENT( L(1), L(tab))
- JMPENT( L(2), L(tab))
- JMPENT( L(3), L(tab))
- JMPENT( L(4), L(tab))
- JMPENT( L(5), L(tab))
- JMPENT( L(6), L(tab))
- JMPENT( L(7), L(tab))
- JMPENT( L(8), L(tab))
- TEXT
+L(gt2): xor %eax, %eax
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %r8, %r8
+ xor %r9, %r9
+ xor %r10, %r10
-L(6): add (ap), %rax
- adc 8(ap), %rcx
- adc 16(ap), %rdx
- adc $0, %r9
- add $24, ap
-L(3): add (ap), %rax
- adc 8(ap), %rcx
- adc 16(ap), %rdx
- jmp L(cj1)
-
-L(7): add (ap), %rax
- adc 8(ap), %rcx
- adc 16(ap), %rdx
+L(top): add (%rdi), %rax
+ adc $0, %r10
+ add 8(%rdi), %rcx
+ adc $0, %r8
+ add 16(%rdi), %rdx
adc $0, %r9
- add $24, ap
-L(4): add (ap), %rax
- adc 8(ap), %rcx
- adc 16(ap), %rdx
- adc $0, %r9
- add $24, ap
-L(1): add (ap), %rax
- adc $0, %rcx
- jmp L(cj2)
-
-L(8): add (ap), %rax
- adc 8(ap), %rcx
- adc 16(ap), %rdx
- adc $0, %r9
- add $24, ap
-L(5): add (ap), %rax
- adc 8(ap), %rcx
- adc 16(ap), %rdx
+
+ sub $3,%rsi
+ jng L(end)
+
+ add 24(%rdi), %rax
+ adc $0, %r10
+ add 32(%rdi), %rcx
+ adc $0, %r8
+ add 40(%rdi), %rdx
+ lea 48(%rdi), %rdi
adc $0, %r9
- add $24, ap
-L(2): add (ap), %rax
- adc 8(ap), %rcx
-L(cj2): adc $0, %rdx
-L(cj1): adc $0, %r9
-L(0): add %r9, %rax
- adc $0, %rcx
- adc $0, %rdx
- adc $0, %rax
+ sub $3,%rsi
+ jg L(top)
+
+
+ add $-24, %rdi
+L(end): add %r9, %rax
+ adc %r10, %rcx
+ adc %r8, %rdx
+
+ inc %rsi
+ mov $0x1, %r10d
+ js L(combine)
+
+ mov $0x10000, %r10d
+ adc 24(%rdi), %rax
+ dec %rsi
+ js L(combine)
+
+ adc 32(%rdi), %rcx
+ mov $0x100000000, %r10
+L(combine):
+ sbb %rsi, %rsi C carry
mov %rax, %rdi C 0mod3
shr $48, %rax C 0mod3 high
+ and %r10, %rsi C carry masked
and %r11, %rdi C 0mod3 low
- mov R32(%rcx), R32(%r10) C 1mod3
+ mov %ecx, %r10d C 1mod3
+ add %rsi, %rax C apply carry
shr $32, %rcx C 1mod3 high
add %rdi, %rax C apply 0mod3 low
- movzwl %dx, R32(%rdi) C 2mod3
+ movzwl %dx, %edi C 2mod3
shl $16, %r10 C 1mod3 low
add %rcx, %rax C apply 1mod3 high
@@ -200,6 +161,5 @@ L(0): add %r9, %rax
add %rdx, %rax C apply 2mod3 high
add %rdi, %rax C apply 2mod3 low
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/mode1o.asm b/gmp/mpn/x86_64/mode1o.asm
index 2cd2b08848..c5f2bc7990 100644
--- a/gmp/mpn/x86_64/mode1o.asm
+++ b/gmp/mpn/x86_64/mode1o.asm
@@ -1,131 +1,139 @@
-dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder.
+dnl AMD64 mpn_modexact_1_odd -- exact division style remainder.
-dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software
+dnl Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 10
-C AMD K10 10
-C Intel P4 33
-C Intel core2 13
-C Intel corei 14.5
-C Intel atom 35
-C VIA nano ?
+C K8,K9: 10
+C K10: 10
+C P4: 33
+C P6-15 (Core2):13
+C P6-28 (Atom): 35
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C
C The dependent chain in the main loop is
C
C cycles
-C sub %rdx, %rax 1
-C imul %r9, %rax 4
-C mul %r8 5
+C subq %rdx, %rax 1
+C imulq %r9, %rax 4
+C mulq %r8 5
C ----
C total 10
C
-C The mov load from src seems to need to be scheduled back before the jz to
-C achieve this speed, out-of-order execution apparently can't completely hide
-C the latency otherwise.
+C The movq load from src seems to need to be scheduled back before the jz to
+C achieve this speed, out-of-order execution apparently can't completely
+C hide the latency otherwise.
C
-C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
-C for the first iteration (where there's no cbit).
+C The l=src[i]-cbit step is rotated back too, since that allows us to avoid
+C it for the first iteration (where there's no cbit).
C
-C The code alignment used (32-byte) for the loop also seems necessary. Without
-C that the non-PIC case has adc crossing the 0x60 offset, apparently making it
-C run at 11 cycles instead of 10.
-
+C The code alignment used (32-byte) for the loop also seems necessary.
+C Without that the non-PIC case has adcq crossing the 0x60 offset,
+C apparently making it run at 11 cycles instead of 10.
+C
+C Not done:
+C
+C divq for size==1 was measured at about 79 cycles, compared to the inverse
+C at about 25 cycles (both including function call overheads), so that's not
+C used.
+C
+C Enhancements:
+C
+C For PIC, we shouldn't really need the GOT fetch for binvert_limb_table,
+C it'll be in rodata or text in libgmp.so and can be accessed directly %rip
+C relative. This would be for small model only (something we don't
+C presently detect, but which is all that gcc 3.3.3 supports), since 8-byte
+C PC-relative relocations are apparently not available. Some rough
+C experiments with binutils 2.13 looked worryingly like it might come out
+C with an unwanted text segment relocation though, even with ".protected".
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_modexact_1_odd)
- FUNC_ENTRY(3)
- mov $0, R32(%rcx)
-IFDOS(` jmp L(ent) ')
+
+ movl $0, %ecx
PROLOGUE(mpn_modexact_1c_odd)
- FUNC_ENTRY(4)
-L(ent):
+
C rdi src
C rsi size
C rdx divisor
C rcx carry
- mov %rdx, %r8 C d
- shr R32(%rdx) C d/2
-
- LEA( binvert_limb_table, %r9)
+ movq %rdx, %r8 C d
+ shrl %edx C d/2
+ifdef(`PIC',`
+ movq binvert_limb_table@GOTPCREL(%rip), %r9
+',`
+ movabsq $binvert_limb_table, %r9
+')
- and $127, R32(%rdx)
- mov %rcx, %r10 C initial carry
+ andl $127, %edx
+ movq %rcx, %r10 C initial carry
- movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits
+ movzbl (%r9,%rdx), %edx C inv 8 bits
- mov (%rdi), %rax C src[0]
- lea (%rdi,%rsi,8), %r11 C src end
- mov %r8, %rdi C d, made available to imull
+ movq (%rdi), %rax C src[0]
+ leaq (%rdi,%rsi,8), %r11 C src end
+ movq %r8, %rdi C d, made available to imull
- lea (%rdx,%rdx), R32(%rcx) C 2*inv
- imul R32(%rdx), R32(%rdx) C inv*inv
+ leal (%rdx,%rdx), %ecx C 2*inv
+ imull %edx, %edx C inv*inv
- neg %rsi C -size
+ negq %rsi C -size
- imul R32(%rdi), R32(%rdx) C inv*inv*d
+ imull %edi, %edx C inv*inv*d
- sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits
+ subl %edx, %ecx C inv = 2*inv - inv*inv*d, 16 bits
- lea (%rcx,%rcx), R32(%rdx) C 2*inv
- imul R32(%rcx), R32(%rcx) C inv*inv
+ leal (%rcx,%rcx), %edx C 2*inv
+ imull %ecx, %ecx C inv*inv
- imul R32(%rdi), R32(%rcx) C inv*inv*d
+ imull %edi, %ecx C inv*inv*d
- sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits
- xor R32(%rcx), R32(%rcx) C initial cbit
+ subl %ecx, %edx C inv = 2*inv - inv*inv*d, 32 bits
+ xorl %ecx, %ecx C initial cbit
- lea (%rdx,%rdx), %r9 C 2*inv
- imul %rdx, %rdx C inv*inv
+ leaq (%rdx,%rdx), %r9 C 2*inv
+ imulq %rdx, %rdx C inv*inv
- imul %r8, %rdx C inv*inv*d
+ imulq %r8, %rdx C inv*inv*d
- sub %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits
- mov %r10, %rdx C initial climb
+ subq %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits
+ movq %r10, %rdx C initial climb
ASSERT(e,` C d*inv == 1 mod 2^64
- mov %r8, %r10
- imul %r9, %r10
- cmp $1, %r10')
+ movq %r8, %r10
+ imulq %r9, %r10
+ cmpq $1, %r10')
- inc %rsi
+ incq %rsi
jz L(one)
@@ -140,31 +148,30 @@ L(top):
C r9 inverse
C r11 src end ptr
- sub %rdx, %rax C l = src[i]-cbit - climb
+ subq %rdx, %rax C l = src[i]-cbit - climb
- adc $0, %rcx C more cbit
- imul %r9, %rax C q = l * inverse
+ adcq $0, %rcx C more cbit
+ imulq %r9, %rax C q = l * inverse
- mul %r8 C climb = high (q * d)
+ mulq %r8 C climb = high (q * d)
- mov (%r11,%rsi,8), %rax C src[i+1]
- sub %rcx, %rax C next l = src[i+1] - cbit
- setc R8(%rcx) C new cbit
+ movq (%r11,%rsi,8), %rax C src[i+1]
+ subq %rcx, %rax C next l = src[i+1] - cbit
+ setc %cl C new cbit
- inc %rsi
+ incq %rsi
jnz L(top)
L(one):
- sub %rdx, %rax C l = src[i]-cbit - climb
+ subq %rdx, %rax C l = src[i]-cbit - climb
- adc $0, %rcx C more cbit
- imul %r9, %rax C q = l * inverse
+ adcq $0, %rcx C more cbit
+ imulq %r9, %rax C q = l * inverse
- mul %r8 C climb = high (q * d)
+ mulq %r8 C climb = high (q * d)
- lea (%rcx,%rdx), %rax C climb+cbit
- FUNC_EXIT()
+ leaq (%rcx,%rdx), %rax C climb+cbit
ret
EPILOGUE(mpn_modexact_1c_odd)
diff --git a/gmp/mpn/x86_64/mul_1.asm b/gmp/mpn/x86_64/mul_1.asm
index b032afc9dd..da96a14c76 100644
--- a/gmp/mpn/x86_64/mul_1.asm
+++ b/gmp/mpn/x86_64/mul_1.asm
@@ -1,97 +1,64 @@
dnl AMD64 mpn_mul_1.
-dnl Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.5
-C AMD K10 2.5
-C AMD bd1 5.0
-C AMD bobcat 5.5
-C Intel P4 12.3
-C Intel core2 4.0
-C Intel NHM 3.75
-C Intel SBR 2.95
-C Intel atom 19.8
-C VIA nano 4.25
-
-C The loop of this code is the result of running a code generation and
+C K8,K9: 2.5
+C K10: 2.5
+C P4: 12.3
+C P6-15: 4.0
+C P6-15 (Core2): 4.0
+C P6-28 (Atom): 19.8
+
+C The inner loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
-C TODO
-C * The loop is great, but the prologue and epilogue code was quickly written.
-C Tune it!
+C TODO:
+C * The inner loop is great, but the prologue and epilogue code was
+C quickly written. Tune it!
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`vl', `%rcx') C r9
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vl', `%rcx')
-define(`n', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(` define(`up', ``%rsi'') ') dnl
-IFDOS(` define(`rp', ``%rcx'') ') dnl
-IFDOS(` define(`vl', ``%r9'') ') dnl
-IFDOS(` define(`r9', ``rdi'') ') dnl
-IFDOS(` define(`n', ``%r8'') ') dnl
-IFDOS(` define(`r8', ``r11'') ') dnl
+define(`n', `%r11')
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_1c)
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
push %rbx
-IFSTD(` mov %r8, %r10')
-IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns)
+ mov %r8, %r10
jmp L(common)
EPILOGUE()
PROLOGUE(mpn_mul_1)
-IFDOS(``push %rsi '')
-IFDOS(``push %rdi '')
-IFDOS(``mov %rdx, %rsi '')
-
push %rbx
xor %r10, %r10
L(common):
mov (up), %rax C read first u limb early
-IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
-IFDOS(` mov n, %rbx ')
+ mov n_param, %rbx C move away n from rdx, mul uses it
mul vl
-IFSTD(` mov %rbx, n ')
+ mov %rbx, %r11
add %r10, %rax
adc $0, %rdx
@@ -146,7 +113,7 @@ L(top): mov %r10, (rp,n,8)
add %rax, %r9
mov (up,n,8), %rax
adc %rdx, %r8
- mov $0, R32(%r10)
+ mov $0, %r10d
L(L1): mul vl
mov %r9, 8(rp,n,8)
add %rax, %r8
@@ -159,11 +126,11 @@ L(L0): mov 8(up,n,8), %rax
L(L3): mov 16(up,n,8), %rax
mul vl
mov %rbx, 24(rp,n,8)
- mov $0, R32(%r8) C zero
- mov %r8, %rbx C zero
+ mov $0, %r8d # zero
+ mov %r8, %rbx # zero
add %rax, %r10
mov 24(up,n,8), %rax
- mov %r8, %r9 C zero
+ mov %r8, %r9 # zero
adc %rdx, %r9
L(L2): mul vl
add $4, n
@@ -177,7 +144,5 @@ L(L2): mul vl
L(ret): mov %rdx, %rax
pop %rbx
-IFDOS(``pop %rdi '')
-IFDOS(``pop %rsi '')
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/mul_2.asm b/gmp/mpn/x86_64/mul_2.asm
index f408c52250..a8ad00069f 100644
--- a/gmp/mpn/x86_64/mul_2.asm
+++ b/gmp/mpn/x86_64/mul_2.asm
@@ -1,44 +1,30 @@
dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl store the result in a third limb vector.
-dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.275
-C AMD K10 2.275
-C Intel P4 13.5
-C Intel core2 4.0
-C Intel corei 3.8
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 2.275
+C K10: 2.275
+C P4: ?
+C P6-15: 4.0
C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.
@@ -64,14 +50,10 @@ define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_2)
- FUNC_ENTRY(4)
push %rbx
push %rbp
@@ -187,6 +169,5 @@ L(m22): mul v1
pop %rbp
pop %rbx
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/k8/mul_basecase.asm b/gmp/mpn/x86_64/mul_basecase.asm
index ca2efb9b2f..09ec4d14ae 100644
--- a/gmp/mpn/x86_64/k8/mul_basecase.asm
+++ b/gmp/mpn/x86_64/mul_basecase.asm
@@ -2,44 +2,30 @@ dnl AMD64 mpn_mul_basecase.
dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
-dnl Copyright 2008, 2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.375
-C AMD K10 2.375
-C Intel P4 15-16
-C Intel core2 4.45
-C Intel corei 4.35
-C Intel atom ?
-C VIA nano 4.5
+C K8,K9: 2.375
+C K10: 2.375
+C P4: ?
+C P6-15: 4.45
C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
@@ -47,6 +33,7 @@ C optimization tool suite written by David Harvey and Torbjorn Granlund.
C TODO
C * Use fewer registers. (how??? I can't see it -- david)
C * Avoid some "mov $0,r" and instead use "xor r,r".
+C * Don't align loops to 32-byte boundaries.
C * Can the top of each L(addmul_outer_n) prologue be folded into the
C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
C case where vn = 1 or 2; is it worth it?
@@ -70,15 +57,10 @@ define(`n', `%r11')
define(`outer_addr', `%r14')
define(`un', `%r13')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_basecase)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
push %rbx
push %rbp
push %r12
@@ -111,13 +93,7 @@ L(mul_1):
cmp $2, R32(w0)
jc L(mul_1_prologue_1)
jz L(mul_1_prologue_2)
-
-L(mul_1_prologue_3):
- add $-1, n
- lea L(addmul_outer_3)(%rip), outer_addr
- mov %rax, w3
- mov %rdx, w0
- jmp L(mul_1_entry_3)
+ jmp L(mul_1_prologue_3)
L(mul_1_prologue_0):
mov %rax, w2
@@ -149,6 +125,13 @@ L(mul_1_prologue_2):
xor R32(w3), R32(w3)
jmp L(mul_1_entry_2)
+L(mul_1_prologue_3):
+ add $-1, n
+ lea L(addmul_outer_3)(%rip), outer_addr
+ mov %rax, w3
+ mov %rdx, w0
+ jmp L(mul_1_entry_3)
+
C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
@@ -309,7 +292,7 @@ L(mul_2_entry_1):
mov w3, -32(rp,n,8)
js L(mul_2_top)
- mov -32(up,n,8), %rax C FIXME: n is constant
+ mov -32(up,n,8), %rax
mul v1
add %rax, w0
mov w0, (rp)
@@ -463,7 +446,6 @@ L(ret): pop %r15
pop %r12
pop %rbp
pop %rbx
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/mulx/adx/addmul_1.asm b/gmp/mpn/x86_64/mulx/adx/addmul_1.asm
deleted file mode 100644
index ea607899a4..0000000000
--- a/gmp/mpn/x86_64/mulx/adx/addmul_1.asm
+++ /dev/null
@@ -1,149 +0,0 @@
-dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 -
-C AMD K10 -
-C AMD bd1 -
-C AMD bobcat -
-C Intel P4 -
-C Intel PNR -
-C Intel NHM -
-C Intel SBR -
-C Intel HWL -
-C Intel BWL ?
-C Intel atom -
-C VIA nano -
-
-define(`rp', `%rdi') dnl rcx
-define(`up', `%rsi') dnl rdx
-define(`n_param', `%rdx') dnl r8
-define(`v0_param',`%rcx') dnl r9
-
-define(`n', `%rcx') dnl
-define(`v0', `%rdx') dnl
-
-C Testing mechanism for running this on older AMD64 processors
-ifelse(FAKE_MULXADX,1,`
- include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
-',`
- define(`adox', ``adox' $1, $2')
- define(`adcx', ``adcx' $1, $2')
- define(`mulx', ``mulx' $1, $2, $3')
-')
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_addmul_1)
- mov (up), %r8
-
- push %rbx
- push %r12
- push %r13
-
- lea (up,n_param,8), up
- lea -16(rp,n_param,8), rp
- mov R32(n_param), R32(%rax)
- xchg v0_param, v0 C FIXME: is this insn fast?
-
- neg n
-
- and $3, R8(%rax)
- jz L(b0)
- cmp $2, R8(%rax)
- jl L(b1)
- jz L(b2)
-
-L(b3): mulx( (up,n,8), %r11, %r10)
- mulx( 8(up,n,8), %r13, %r12)
- mulx( 16(up,n,8), %rbx, %rax)
- dec n
- jmp L(lo3)
-
-L(b0): mulx( (up,n,8), %r9, %r8)
- mulx( 8(up,n,8), %r11, %r10)
- mulx( 16(up,n,8), %r13, %r12)
- jmp L(lo0)
-
-L(b2): mulx( (up,n,8), %r13, %r12)
- mulx( 8(up,n,8), %rbx, %rax)
- lea 2(n), n
- jrcxz L(wd2)
-L(gt2): mulx( (up,n,8), %r9, %r8)
- jmp L(lo2)
-
-L(b1): and R8(%rax), R8(%rax)
- mulx( (up,n,8), %rbx, %rax)
- lea 1(n), n
- jrcxz L(wd1)
- mulx( (up,n,8), %r9, %r8)
- mulx( 8(up,n,8), %r11, %r10)
- jmp L(lo1)
-
-L(end): adcx( %r10, %r13)
- mov %r11, -8(rp)
-L(wd2): adox( (rp), %r13)
- adcx( %r12, %rbx)
- mov %r13, (rp)
-L(wd1): adox( 8(rp), %rbx)
- adcx( %rcx, %rax)
- adox( %rcx, %rax)
- mov %rbx, 8(rp)
- pop %r13
- pop %r12
- pop %rbx
- ret
-
-L(top): jrcxz L(end)
- mulx( (up,n,8), %r9, %r8)
- adcx( %r10, %r13)
- mov %r11, -8(rp,n,8)
-L(lo2): adox( (rp,n,8), %r13)
- mulx( 8(up,n,8), %r11, %r10)
- adcx( %r12, %rbx)
- mov %r13, (rp,n,8)
-L(lo1): adox( 8(rp,n,8), %rbx)
- mulx( 16(up,n,8), %r13, %r12)
- adcx( %rax, %r9)
- mov %rbx, 8(rp,n,8)
-L(lo0): adox( 16(rp,n,8), %r9)
- mulx( 24(up,n,8), %rbx, %rax)
- adcx( %r8, %r11)
- mov %r9, 16(rp,n,8)
-L(lo3): adox( 24(rp,n,8), %r11)
- lea 4(n), n
- jmp L(top)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/mulx/aorsmul_1.asm b/gmp/mpn/x86_64/mulx/aorsmul_1.asm
deleted file mode 100644
index 285c07335e..0000000000
--- a/gmp/mpn/x86_64/mulx/aorsmul_1.asm
+++ /dev/null
@@ -1,161 +0,0 @@
-dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 -
-C AMD K10 -
-C AMD bd1 -
-C AMD bd2 ?
-C AMD bobcat -
-C AMD jaguar ?
-C Intel P4 -
-C Intel PNR -
-C Intel NHM -
-C Intel SBR -
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom -
-C VIA nano -
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0_param',`%rcx') C r9
-
-define(`n', `%rcx')
-define(`v0', `%rdx')
-
-ifdef(`OPERATION_addmul_1',`
- define(`ADDSUB', `add')
- define(`ADCSBB', `adc')
- define(`func', `mpn_addmul_1')
-')
-ifdef(`OPERATION_submul_1',`
- define(`ADDSUB', `sub')
- define(`ADCSBB', `sbb')
- define(`func', `mpn_submul_1')
-')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-IFDOS(` define(`up', ``%rsi'') ') dnl
-IFDOS(` define(`rp', ``%rcx'') ') dnl
-IFDOS(` define(`vl', ``%r9'') ') dnl
-IFDOS(` define(`r9', ``rdi'') ') dnl
-IFDOS(` define(`n', ``%r8'') ') dnl
-IFDOS(` define(`r8', ``r11'') ') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- mov (up), %r8
-
- push %rbx
- push %r12
- push %r13
-
- lea (up,n_param,8), up
- lea -32(rp,n_param,8), rp
- mov R32(n_param), R32(%rax)
- xchg v0_param, v0 C FIXME: is this insn fast?
-
- neg n
-
- and $3, R8(%rax)
- jz L(b0)
- cmp $2, R8(%rax)
- jz L(b2)
- jg L(b3)
-
-L(b1): mulx %r8, %rbx, %rax
- sub $-1, n
- jz L(wd1)
- mulx (up,n,8), %r9, %r8
- mulx 8(up,n,8), %r11, %r10
- test R32(%rax), R32(%rax) C clear cy
- jmp L(lo1)
-
-L(b0): mulx %r8, %r9, %r8
- mulx 8(up,n,8), %r11, %r10
- mulx 16(up,n,8), %r13, %r12
- xor R32(%rax), R32(%rax)
- jmp L(lo0)
-
-L(b3): mulx %r8, %r11, %r10
- mulx 8(up,n,8), %r13, %r12
- mulx 16(up,n,8), %rbx, %rax
- add %r10, %r13
- adc %r12, %rbx
- adc $0, %rax
- sub $-3, n
- jz L(wd3)
- test R32(%rax), R32(%rax) C clear cy
- jmp L(lo3)
-
-L(b2): mulx %r8, %r13, %r12
- mulx 8(up,n,8), %rbx, %rax
- add %r12, %rbx
- adc $0, %rax
- sub $-2, n
- jz L(wd2)
- mulx (up,n,8), %r9, %r8
- test R32(%rax), R32(%rax) C clear cy
- jmp L(lo2)
-
-L(top): ADDSUB %r9, (rp,n,8)
-L(lo3): mulx (up,n,8), %r9, %r8
- ADCSBB %r11, 8(rp,n,8)
-L(lo2): mulx 8(up,n,8), %r11, %r10
- ADCSBB %r13, 16(rp,n,8)
-L(lo1): mulx 16(up,n,8), %r13, %r12
- ADCSBB %rbx, 24(rp,n,8)
- adc %rax, %r9
-L(lo0): mulx 24(up,n,8), %rbx, %rax
- adc %r8, %r11
- adc %r10, %r13
- adc %r12, %rbx
- adc $0, %rax C rax = carry limb
- add $4, n
- js L(top)
-
-L(end): ADDSUB %r9, (rp)
-L(wd3): ADCSBB %r11, 8(rp)
-L(wd2): ADCSBB %r13, 16(rp)
-L(wd1): ADCSBB %rbx, 24(rp)
- adc n, %rax
- pop %r13
- pop %r12
- pop %rbx
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/mulx/mul_1.asm b/gmp/mpn/x86_64/mulx/mul_1.asm
deleted file mode 100644
index 34a044dcdc..0000000000
--- a/gmp/mpn/x86_64/mulx/mul_1.asm
+++ /dev/null
@@ -1,154 +0,0 @@
-dnl AMD64 mpn_mul_1 for CPUs with mulx.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 -
-C AMD K10 -
-C AMD bd1 -
-C AMD bd2 ?
-C AMD bobcat -
-C AMD jaguar ?
-C Intel P4 -
-C Intel PNR -
-C Intel NHM -
-C Intel SBR -
-C Intel HWL ?
-C Intel BWL ?
-C Intel atom -
-C VIA nano -
-
-define(`rp', `%rdi') C rcx
-define(`up', `%rsi') C rdx
-define(`n_param', `%rdx') C r8
-define(`v0_param',`%rcx') C r9
-
-define(`n', `%rcx')
-define(`v0', `%rdx')
-
-IFDOS(` define(`up', ``%rsi'') ') dnl
-IFDOS(` define(`rp', ``%rcx'') ') dnl
-IFDOS(` define(`v0', ``%r9'') ') dnl
-IFDOS(` define(`r9', ``rdi'') ') dnl
-IFDOS(` define(`n', ``%r8'') ') dnl
-IFDOS(` define(`r8', ``r11'') ') dnl
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_mul_1c)
- jmp L(ent)
-EPILOGUE()
-PROLOGUE(mpn_mul_1)
- xor R32(%r8), R32(%r8) C carry-in limb
-L(ent): mov (up), %r9
-
- push %rbx
- push %r12
- push %r13
-
- lea (up,n_param,8), up
- lea -32(rp,n_param,8), rp
- mov R32(n_param), R32(%rax)
- xchg v0_param, v0 C FIXME: is this insn fast?
-
- neg n
-
- and $3, R8(%rax)
- jz L(b0)
- cmp $2, R8(%rax)
- jz L(b2)
- jg L(b3)
-
-L(b1): mov %r8, %r12
- mulx %r9, %rbx, %rax
- sub $-1, n
- jz L(wd1)
- mulx (up,n,8), %r9, %r8
- mulx 8(up,n,8), %r11, %r10
- add %r12, %rbx
- jmp L(lo1)
-
-L(b3): mulx %r9, %r11, %r10
- mulx 8(up,n,8), %r13, %r12
- mulx 16(up,n,8), %rbx, %rax
- sub $-3, n
- jz L(wd3)
- add %r8, %r11
- jmp L(lo3)
-
-L(b2): mov %r8, %r10 C carry-in limb
- mulx %r9, %r13, %r12
- mulx 8(up,n,8), %rbx, %rax
- sub $-2, n
- jz L(wd2)
- mulx (up,n,8), %r9, %r8
- add %r10, %r13
- jmp L(lo2)
-
-L(b0): mov %r8, %rax C carry-in limb
- mulx %r9, %r9, %r8
- mulx 8(up,n,8), %r11, %r10
- mulx 16(up,n,8), %r13, %r12
- add %rax, %r9
- jmp L(lo0)
-
-L(top): jrcxz L(end)
- adc %r8, %r11
- mov %r9, (rp,n,8)
-L(lo3): mulx (up,n,8), %r9, %r8
- adc %r10, %r13
- mov %r11, 8(rp,n,8)
-L(lo2): mulx 8(up,n,8), %r11, %r10
- adc %r12, %rbx
- mov %r13, 16(rp,n,8)
-L(lo1): mulx 16(up,n,8), %r13, %r12
- adc %rax, %r9
- mov %rbx, 24(rp,n,8)
-L(lo0): mulx 24(up,n,8), %rbx, %rax
- lea 4(n), n
- jmp L(top)
-
-L(end): mov %r9, (rp)
-L(wd3): adc %r8, %r11
- mov %r11, 8(rp)
-L(wd2): adc %r10, %r13
- mov %r13, 16(rp)
-L(wd1): adc %r12, %rbx
- adc n, %rax
- mov %rbx, 24(rp)
-
- pop %r13
- pop %r12
- pop %rbx
- ret
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/nano/copyd.asm b/gmp/mpn/x86_64/nano/copyd.asm
deleted file mode 100644
index f0dc54a55e..0000000000
--- a/gmp/mpn/x86_64/nano/copyd.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyd optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyd)
-include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp/mpn/x86_64/nano/copyi.asm b/gmp/mpn/x86_64/nano/copyi.asm
deleted file mode 100644
index 9c26e00c52..0000000000
--- a/gmp/mpn/x86_64/nano/copyi.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_copyi optimised for Intel Sandy Bridge.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_copyi)
-include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp/mpn/x86_64/nano/dive_1.asm b/gmp/mpn/x86_64/nano/dive_1.asm
deleted file mode 100644
index e9a07631c4..0000000000
--- a/gmp/mpn/x86_64/nano/dive_1.asm
+++ /dev/null
@@ -1,166 +0,0 @@
-dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
-
-dnl Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C norm unorm
-C AMD K8,K9 11 11
-C AMD K10 11 11
-C Intel P4 ?
-C Intel core2 13.5 13.25
-C Intel corei 14.25
-C Intel atom 34 36
-C VIA nano 19.25 19.25
-
-
-C INPUT PARAMETERS
-C rp rdi
-C up rsi
-C n rdx
-C divisor rcx
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_divexact_1)
- FUNC_ENTRY(4)
- push %rbx
-
- mov %rcx, %rax
- xor R32(%rcx), R32(%rcx) C shift count
- mov %rdx, %r8
-
- bt $0, R32(%rax)
- jc L(odd) C skip bsfq unless divisor is even
- bsf %rax, %rcx
- shr R8(%rcx), %rax
-L(odd): mov %rax, %rbx
- shr R32(%rax)
- and $127, R32(%rax) C d/2, 7 bits
-
- LEA( binvert_limb_table, %rdx)
-
- movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
-
- mov %rbx, %r11 C d without twos
-
- lea (%rax,%rax), R32(%rdx) C 2*inv
- imul R32(%rax), R32(%rax) C inv*inv
- imul R32(%rbx), R32(%rax) C inv*inv*d
- sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
-
- lea (%rdx,%rdx), R32(%rax) C 2*inv
- imul R32(%rdx), R32(%rdx) C inv*inv
- imul R32(%rbx), R32(%rdx) C inv*inv*d
- sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
-
- lea (%rax,%rax), %r10 C 2*inv
- imul %rax, %rax C inv*inv
- imul %rbx, %rax C inv*inv*d
- sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits
-
- lea (%rsi,%r8,8), %rsi C up end
- lea -8(%rdi,%r8,8), %rdi C rp end
- neg %r8 C -n
-
- mov (%rsi,%r8,8), %rax C up[0]
-
- inc %r8
- jz L(one)
-
- test R32(%rcx), R32(%rcx)
- jnz L(unorm) C branch if count != 0
- xor R32(%rbx), R32(%rbx)
- jmp L(nent)
-
- ALIGN(8)
-L(ntop):mul %r11 C carry limb in rdx 0 10
- mov -8(%rsi,%r8,8), %rax C
- sub %rbx, %rax C apply carry bit
- setc %bl C
- sub %rdx, %rax C apply carry limb 5
- adc $0, %rbx C 6
-L(nent):imul %r10, %rax C 6
- mov %rax, (%rdi,%r8,8) C
- inc %r8 C
- jnz L(ntop)
-
- mov -8(%rsi), %r9 C up high limb
- jmp L(com)
-
-L(unorm):
- mov (%rsi,%r8,8), %r9 C up[1]
- shr R8(%rcx), %rax C
- neg R32(%rcx)
- shl R8(%rcx), %r9 C
- neg R32(%rcx)
- or %r9, %rax
- xor R32(%rbx), R32(%rbx)
- jmp L(uent)
-
- ALIGN(8)
-L(utop):mul %r11 C carry limb in rdx 0 10
- mov (%rsi,%r8,8), %rax C
- shl R8(%rcx), %rax C
- neg R32(%rcx)
- or %r9, %rax
- sub %rbx, %rax C apply carry bit
- setc %bl C
- sub %rdx, %rax C apply carry limb 5
- adc $0, %rbx C 6
-L(uent):imul %r10, %rax C 6
- mov (%rsi,%r8,8), %r9 C
- shr R8(%rcx), %r9 C
- neg R32(%rcx)
- mov %rax, (%rdi,%r8,8) C
- inc %r8 C
- jnz L(utop)
-
-L(com): mul %r11 C carry limb in rdx
- sub %rbx, %r9 C apply carry bit
- sub %rdx, %r9 C apply carry limb
- imul %r10, %r9
- mov %r9, (%rdi)
- pop %rbx
- FUNC_EXIT()
- ret
-
-L(one): shr R8(%rcx), %rax
- imul %r10, %rax
- mov %rax, (%rdi)
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/nano/gcd_1.asm b/gmp/mpn/x86_64/nano/gcd_1.asm
deleted file mode 100644
index 3d8e5c7ab1..0000000000
--- a/gmp/mpn/x86_64/nano/gcd_1.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl AMD64 mpn_gcd_1.
-
-dnl Copyright 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
diff --git a/gmp/mpn/x86_64/nano/gmp-mparam.h b/gmp/mpn/x86_64/nano/gmp-mparam.h
deleted file mode 100644
index fde69dbb7f..0000000000
--- a/gmp/mpn/x86_64/nano/gmp-mparam.h
+++ /dev/null
@@ -1,243 +0,0 @@
-/* VIA Nano gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
-Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
-
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
-
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
-
-#define SHLD_SLOW 1
-#define SHRD_SLOW 1
-
-/* 1600 MHz Nano 2xxx */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 22
-
-#define MUL_TOOM22_THRESHOLD 27
-#define MUL_TOOM33_THRESHOLD 38
-#define MUL_TOOM44_THRESHOLD 324
-#define MUL_TOOM6H_THRESHOLD 450
-#define MUL_TOOM8H_THRESHOLD 632
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 207
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 211
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 219
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 315
-
-#define SQR_BASECASE_THRESHOLD 10
-#define SQR_TOOM2_THRESHOLD 52
-#define SQR_TOOM3_THRESHOLD 73
-#define SQR_TOOM4_THRESHOLD 387
-#define SQR_TOOM6_THRESHOLD 662
-#define SQR_TOOM8_THRESHOLD 781
-
-#define MULMID_TOOM42_THRESHOLD 32
-
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 15
-
-#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 19, 7}, { 39, 8}, \
- { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
- { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
- { 49, 9}, { 27,10}, { 15, 9}, { 43,10}, \
- { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
- { 95,10}, { 79,11}, { 47,10}, { 103,12}, \
- { 31,11}, { 63,10}, { 143,11}, { 79,10}, \
- { 159, 9}, { 319,10}, { 175,11}, { 95, 9}, \
- { 383, 8}, { 767,10}, { 207,11}, { 111,12}, \
- { 63,11}, { 127,10}, { 255,11}, { 143, 9}, \
- { 575, 8}, { 1151,10}, { 303,11}, { 159,10}, \
- { 319, 9}, { 639, 8}, { 1279,10}, { 335,12}, \
- { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \
- { 207,10}, { 415, 9}, { 831, 8}, { 1663,10}, \
- { 447,13}, { 63,12}, { 127,11}, { 255,10}, \
- { 511, 9}, { 1023,11}, { 271,10}, { 543, 9}, \
- { 1087,10}, { 575, 9}, { 1215,12}, { 159,11}, \
- { 319,10}, { 639, 9}, { 1279,11}, { 335,10}, \
- { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \
- { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \
- { 1535,10}, { 831, 9}, { 1663,12}, { 223,11}, \
- { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
- { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
- { 287,11}, { 575,10}, { 1151,11}, { 607,10}, \
- { 1215,12}, { 319,11}, { 639,10}, { 1279,11}, \
- { 671,10}, { 1343,12}, { 351,11}, { 703,10}, \
- { 1407,13}, { 191,12}, { 383,11}, { 767,10}, \
- { 1535,12}, { 415,11}, { 831,10}, { 1663,12}, \
- { 447,11}, { 895,10}, { 1791,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
- { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \
- { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \
- { 671,11}, { 1343,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \
- { 1663,13}, { 447,12}, { 895,11}, { 1791,13}, \
- { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \
- { 575,12}, { 1151,11}, { 2303,12}, { 1215,13}, \
- { 639,12}, { 1279,11}, { 2559,12}, { 1343,13}, \
- { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
- { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \
- { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \
- { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
- { 2303,13}, { 1215,14}, { 639,13}, { 1279,12}, \
- { 2559,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
- { 767,13}, { 1535,12}, { 3071,13}, { 1663,14}, \
- { 895,13}, { 1791,12}, { 3583,13}, { 1919,15}, \
- { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \
- { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
- { 2431,14}, { 1279,13}, { 2559,12}, { 5119,14}, \
- { 1407,13}, { 2815,12}, { 5631,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 224
-#define MUL_FFT_THRESHOLD 3520
-
-#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
- { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
- { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
- { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 63, 9}, { 127,10}, { 71, 9}, \
- { 143,10}, { 79,11}, { 47,10}, { 95, 9}, \
- { 191,10}, { 103,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 135, 7}, { 1087, 9}, \
- { 287,11}, { 79, 9}, { 319, 8}, { 639,10}, \
- { 167,11}, { 95,10}, { 191, 9}, { 383, 8}, \
- { 767,11}, { 111,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511, 8}, { 1023,10}, { 271, 9}, \
- { 543, 8}, { 1087,11}, { 143, 9}, { 575, 8}, \
- { 1151,10}, { 303, 9}, { 639, 8}, { 1279,10}, \
- { 335, 9}, { 671,10}, { 351, 9}, { 703,12}, \
- { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \
- { 207,10}, { 415, 9}, { 831,13}, { 63,12}, \
- { 127,11}, { 255,10}, { 511, 9}, { 1023,11}, \
- { 271,10}, { 543, 9}, { 1087,10}, { 575, 9}, \
- { 1151,11}, { 303,10}, { 607, 9}, { 1215,12}, \
- { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \
- { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \
- { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \
- { 1535,11}, { 415,10}, { 831, 9}, { 1663,12}, \
- { 223,11}, { 447,10}, { 959,13}, { 127,12}, \
- { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
- { 1087,11}, { 575,10}, { 1215,12}, { 319,11}, \
- { 639,10}, { 1279,11}, { 671,10}, { 1343,12}, \
- { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \
- { 383,11}, { 767,10}, { 1535,12}, { 415,11}, \
- { 831,10}, { 1663,12}, { 447,11}, { 895,10}, \
- { 1791,12}, { 479,11}, { 959,14}, { 127,12}, \
- { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
- { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \
- { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
- { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
- { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \
- { 447,12}, { 895,11}, { 1791,12}, { 959,13}, \
- { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \
- { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
- { 703,12}, { 1407,11}, { 2815,13}, { 767,12}, \
- { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \
- { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \
- { 2047,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
- { 639,13}, { 1279,12}, { 2559,13}, { 1407,12}, \
- { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \
- { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \
- { 1919,15}, { 511,14}, { 1023,13}, { 2047,12}, \
- { 4095,13}, { 2175,14}, { 1151,13}, { 2303,12}, \
- { 4607,14}, { 1279,13}, { 2559,14}, { 1407,13}, \
- { 2815,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 230
-#define SQR_FFT_THRESHOLD 2496
-
-#define MULLO_BASECASE_THRESHOLD 13
-#define MULLO_DC_THRESHOLD 38
-#define MULLO_MUL_N_THRESHOLD 6633
-
-#define DC_DIV_QR_THRESHOLD 56
-#define DC_DIVAPPR_Q_THRESHOLD 173
-#define DC_BDIV_QR_THRESHOLD 55
-#define DC_BDIV_Q_THRESHOLD 96
-
-#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 202
-#define INV_APPR_THRESHOLD 166
-
-#define BINV_NEWTON_THRESHOLD 246
-#define REDC_1_TO_REDC_2_THRESHOLD 7
-#define REDC_2_TO_REDC_N_THRESHOLD 85
-
-#define MU_DIV_QR_THRESHOLD 1499
-#define MU_DIVAPPR_Q_THRESHOLD 1652
-#define MUPI_DIV_QR_THRESHOLD 83
-#define MU_BDIV_QR_THRESHOLD 1210
-#define MU_BDIV_Q_THRESHOLD 1499
-
-#define POWM_SEC_TABLE 1,28,129,642,2387
-
-#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 127
-#define HGCD_APPR_THRESHOLD 214
-#define HGCD_REDUCE_THRESHOLD 2479
-#define GCD_DC_THRESHOLD 487
-#define GCDEXT_DC_THRESHOLD 505
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 802
-#define SET_STR_PRECOMPUTE_THRESHOLD 2042
-
-#define FAC_DSC_THRESHOLD 1737
-#define FAC_ODD_THRESHOLD 44
diff --git a/gmp/mpn/x86_64/nano/popcount.asm b/gmp/mpn/x86_64/nano/popcount.asm
deleted file mode 100644
index fb14dd3d31..0000000000
--- a/gmp/mpn/x86_64/nano/popcount.asm
+++ /dev/null
@@ -1,35 +0,0 @@
-dnl x86-64 mpn_popcount.
-
-dnl Copyright 2007, 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_popcount)
-include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp/mpn/x86_64/pentium4/aors_n.asm b/gmp/mpn/x86_64/pentium4/aors_n.asm
index 8e6ee1bae6..90f5a219b9 100644
--- a/gmp/mpn/x86_64/pentium4/aors_n.asm
+++ b/gmp/mpn/x86_64/pentium4/aors_n.asm
@@ -1,46 +1,30 @@
dnl x86-64 mpn_add_n/mpn_sub_n optimized for Pentium 4.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.8
-C AMD K10 2.8
-C Intel P4 4
-C Intel core2 3.6-5 (fluctuating)
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
+C K8,K9: 2.8
+C K10: 2.8
+C P4: 4
+C P6-15: 3.6-5 (fluctuating)
C INPUT PARAMETERS
@@ -59,20 +43,19 @@ ifdef(`OPERATION_sub_n', `
define(func, mpn_sub_n)
define(func_nc, mpn_sub_nc)')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
ASM_START()
+
TEXT
+ ALIGN(16)
+
+PROLOGUE(func_nc)
+ jmp L(ent)
+EPILOGUE()
+
PROLOGUE(func)
- FUNC_ENTRY(4)
xor %r8, %r8
-IFDOS(` jmp L(ent) ')
-EPILOGUE()
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
L(ent): push %rbx
push %r12
@@ -191,6 +174,5 @@ L(1): mov %r11, 8(rp)
L(ret): mov R32(%rbx), R32(%rax)
pop %r12
pop %rbx
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/pentium4/aorslsh1_n.asm b/gmp/mpn/x86_64/pentium4/aorslsh1_n.asm
index 66937d3267..0723f3e6ca 100644
--- a/gmp/mpn/x86_64/pentium4/aorslsh1_n.asm
+++ b/gmp/mpn/x86_64/pentium4/aorslsh1_n.asm
@@ -1,50 +1,192 @@
dnl AMD64 mpn_addlsh1_n, mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1),
-dnl optimised for Pentium 4.
+dnl optimized for Pentium 4.
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2008 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-define(LSH, 1)
-define(RSH, 31) C 31, not 63, since we use 32-bit ops
+C cycles/limb
+C K8,K9: 3.8
+C K10: 4.8
+C P4: 5.8
+C P6-15: ?
+
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
ifdef(`OPERATION_addlsh1_n', `
- define(ADDSUB, add)
- define(func, mpn_addlsh1_n)')
+ define(ADDSUB, add)
+ define(func, mpn_addlsh1_n)')
ifdef(`OPERATION_sublsh1_n', `
- define(ADDSUB, sub)
- define(func, mpn_sublsh1_n)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+ define(ADDSUB, sub)
+ define(func, mpn_sublsh1_n)')
MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
-include_mpn(`x86_64/pentium4/aorslshC_n.asm')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ push %rbx
+ push %r12
+ push %rbp
+
+ mov (vp), %r9
+ shl %r9
+ mov 4(vp), R32(%rbp)
+
+ xor R32(%rbx), R32(%rbx)
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jne L(n00) C n = 0, 4, 8, ...
+
+ mov (up), %r8
+ mov 8(up), %r10
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r8
+ mov 8(vp), %r9
+ lea (%rbp,%r9,2), %r9
+ setc R8(%rax)
+ mov 12(vp), R32(%rbp)
+ lea -16(rp), rp
+ jmp L(L00)
+
+L(n00): cmp $2, R32(%rax)
+ jnc L(n01) C n = 1, 5, 9, ...
+ mov (up), %r11
+ lea -8(rp), rp
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ dec n
+ jz L(1) C jump for n = 1
+ mov 8(up), %r8
+ mov 8(vp), %r9
+ lea (%rbp,%r9,2), %r9
+ mov 12(vp), R32(%rbp)
+ lea 8(up), up
+ lea 8(vp), vp
+ jmp L(L01)
+
+L(n01): jne L(n10) C n = 2, 6, 10, ...
+ mov (up), %r12
+ mov 8(up), %r11
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r12
+ mov 8(vp), %r9
+ lea (%rbp,%r9,2), %r9
+ setc R8(%rax)
+ mov 12(vp), R32(%rbp)
+ lea 16(up), up
+ lea 16(vp), vp
+ jmp L(L10)
+
+L(n10): mov (up), %r10
+ mov 8(up), %r12
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r10
+ mov 8(vp), %r9
+ lea (%rbp,%r9,2), %r9
+ setc R8(%rbx)
+ mov 12(vp), R32(%rbp)
+ lea -24(rp), rp
+ lea -8(up), up
+ lea -8(vp), vp
+ jmp L(L11)
+
+L(c0): mov $1, R8(%rbx)
+ jmp L(rc0)
+L(c1): mov $1, R8(%rax)
+ jmp L(rc1)
+L(c2): mov $1, R8(%rbx)
+ jmp L(rc2)
+
+ ALIGN(16)
+L(top): mov (up), %r8 C not on critical path
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r11 C not on critical path
+ mov (vp), %r9
+ lea (%rbp,%r9,2), %r9
+ setc R8(%rbx) C save carry out
+ mov 4(vp), R32(%rbp)
+ mov %r12, (rp)
+ ADDSUB %rax, %r11 C apply previous carry out
+ jc L(c0) C jump if ripple
+L(rc0):
+L(L01): mov 8(up), %r10
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r8
+ mov 8(vp), %r9
+ lea (%rbp,%r9,2), %r9
+ setc R8(%rax)
+ mov 12(vp), R32(%rbp)
+ mov %r11, 8(rp)
+ ADDSUB %rbx, %r8
+ jc L(c1)
+L(rc1):
+L(L00): mov 16(up), %r12
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r10
+ mov 16(vp), %r9
+ lea (%rbp,%r9,2), %r9
+ setc R8(%rbx)
+ mov 20(vp), R32(%rbp)
+ mov %r8, 16(rp)
+ ADDSUB %rax, %r10
+ jc L(c2)
+L(rc2):
+L(L11): mov 24(up), %r11
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r12
+ mov 24(vp), %r9
+ lea (%rbp,%r9,2), %r9
+ lea 32(up), up
+ lea 32(vp), vp
+ setc R8(%rax)
+ mov -4(vp), R32(%rbp)
+ mov %r10, 24(rp)
+ ADDSUB %rbx, %r12
+ jc L(c3)
+L(rc3): lea 32(rp), rp
+L(L10): sub $4, n
+ ja L(top)
+
+L(end):
+ shr $31, R32(%rbp)
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ mov %r12, (rp)
+ ADDSUB %rax, %r11
+ jnc L(1)
+ mov $1, R8(%rbx)
+L(1): mov %r11, 8(rp)
+ lea (%rbx,%rbp), R32(%rax)
+ pop %rbp
+ pop %r12
+ pop %rbx
+ emms
+ ret
+L(c3): mov $1, R8(%rax)
+ jmp L(rc3)
+EPILOGUE()
+ASM_END()
diff --git a/gmp/mpn/x86_64/pentium4/aorslsh2_n.asm b/gmp/mpn/x86_64/pentium4/aorslsh2_n.asm
deleted file mode 100644
index 001f0ac5bf..0000000000
--- a/gmp/mpn/x86_64/pentium4/aorslsh2_n.asm
+++ /dev/null
@@ -1,50 +0,0 @@
-dnl AMD64 mpn_addlsh2_n, mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2),
-dnl optimised for Pentium 4.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-define(LSH, 2) C left shift count applied to vp[], as in rp[] = up[] +- (vp[] << 2)
-define(RSH, 30) C 30, not 62, since we use 32-bit ops
-
-ifdef(`OPERATION_addlsh2_n', `
- define(ADDSUB, add)
- define(func, mpn_addlsh2_n)')
-ifdef(`OPERATION_sublsh2_n', `
- define(ADDSUB, sub)
- define(func, mpn_sublsh2_n)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n)
-include_mpn(`x86_64/pentium4/aorslshC_n.asm') C shared body that uses LSH, RSH, ADDSUB and func
diff --git a/gmp/mpn/x86_64/pentium4/aorslshC_n.asm b/gmp/mpn/x86_64/pentium4/aorslshC_n.asm
deleted file mode 100644
index d03c6a3f30..0000000000
--- a/gmp/mpn/x86_64/pentium4/aorslshC_n.asm
+++ /dev/null
@@ -1,203 +0,0 @@
-dnl AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
-dnl C is 1, 2, 3. Optimized for Pentium 4.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-C cycles/limb
-C AMD K8,K9 3.8
-C AMD K10 3.8
-C Intel P4 5.8
-C Intel core2 4.75
-C Intel corei 4.75
-C Intel atom ?
-C VIA nano 4.75
-
-
-C INPUT PARAMETERS
-define(`rp',`%rdi')
-define(`up',`%rsi')
-define(`vp',`%rdx')
-define(`n', `%rcx')
-
-define(M, eval(m4_lshift(1,LSH)))
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START() C shared body: rp[] = up[] ADDSUB (vp[] << LSH), 4-way unrolled
- TEXT
- ALIGN(16)
-PROLOGUE(func)
- FUNC_ENTRY(4)
- push %rbx C rbx, r12 and rbp are used as scratch and popped again before ret
- push %r12
- push %rbp
-
- mov (vp), %r9 C r9 = vp[0]
- shl $LSH, %r9 C r9 = vp[0] << LSH
- mov 4(vp), R32(%rbp) C ebp = upper 32 bits of vp[0]
-
- xor R32(%rbx), R32(%rbx) C zero rbx, later setc only writes its low byte
-
- mov R32(n), R32(%rax)
- and $3, R32(%rax) C eax = n mod 4, selects the entry point into the unrolled loop
- jne L(n00) C n = 0, 4, 8, ...
-
- mov (up), %r8
- mov 8(up), %r10
- shr $RSH, R32(%rbp) C ebp = top bits of vp[0], the part shifted out when RSH = 32-LSH
- ADDSUB %r9, %r8 C first limb: up[0] ADDSUB (vp[0] << LSH)
- mov 8(vp), %r9
- lea (%rbp,%r9,M), %r9 C r9 = vp[1]*M + bits carried over from vp[0]
- setc R8(%rax) C save carry out
- mov 12(vp), R32(%rbp) C ebp = upper 32 bits of vp[1]
- lea -16(rp), rp C bias rp so that 16(rp) at L(L00) addresses rp[0]
- jmp L(L00)
-
-L(n00): cmp $2, R32(%rax)
- jnc L(n01) C n = 1, 5, 9, ...
- mov (up), %r11
- lea -8(rp), rp C bias rp, this limb is stored through 8(rp)
- shr $RSH, R32(%rbp)
- ADDSUB %r9, %r11
- setc R8(%rbx) C save carry out
- dec n
- jz L(1) C jump for n = 1
- mov 8(up), %r8
- mov 8(vp), %r9
- lea (%rbp,%r9,M), %r9
- mov 12(vp), R32(%rbp)
- lea 8(up), up
- lea 8(vp), vp
- jmp L(L01)
-
-L(n01): jne L(n10) C n = 2, 6, 10, ...
- mov (up), %r12
- mov 8(up), %r11
- shr $RSH, R32(%rbp)
- ADDSUB %r9, %r12
- mov 8(vp), %r9
- lea (%rbp,%r9,M), %r9
- setc R8(%rax) C save carry out
- mov 12(vp), R32(%rbp)
- lea 16(up), up
- lea 16(vp), vp
- jmp L(L10)
-
-L(n10): mov (up), %r10 C remaining case, n mod 4 = 3
- mov 8(up), %r12
- shr $RSH, R32(%rbp)
- ADDSUB %r9, %r10
- mov 8(vp), %r9
- lea (%rbp,%r9,M), %r9
- setc R8(%rbx) C save carry out
- mov 12(vp), R32(%rbp)
- lea -24(rp), rp C bias rp so that 24(rp) at L(L11) addresses rp[0]
- lea -8(up), up
- lea -8(vp), vp
- jmp L(L11)
-
-L(c0): mov $1, R8(%rbx) C ripple: applying the previous carry carried again, record carry out = 1
- jmp L(rc0)
-L(c1): mov $1, R8(%rax)
- jmp L(rc1)
-L(c2): mov $1, R8(%rbx)
- jmp L(rc2)
-
- ALIGN(16)
-L(top): mov (up), %r8 C not on critical path
- shr $RSH, R32(%rbp)
- ADDSUB %r9, %r11 C not on critical path
- mov (vp), %r9
- lea (%rbp,%r9,M), %r9
- setc R8(%rbx) C save carry out
- mov 4(vp), R32(%rbp)
- mov %r12, (rp)
- ADDSUB %rax, %r11 C apply previous carry out
- jc L(c0) C jump if ripple
-L(rc0):
-L(L01): mov 8(up), %r10
- shr $RSH, R32(%rbp)
- ADDSUB %r9, %r8
- mov 8(vp), %r9
- lea (%rbp,%r9,M), %r9
- setc R8(%rax) C save carry out, rax and rbx alternate in this role
- mov 12(vp), R32(%rbp)
- mov %r11, 8(rp)
- ADDSUB %rbx, %r8 C apply previous carry out
- jc L(c1)
-L(rc1):
-L(L00): mov 16(up), %r12
- shr $RSH, R32(%rbp)
- ADDSUB %r9, %r10
- mov 16(vp), %r9
- lea (%rbp,%r9,M), %r9
- setc R8(%rbx)
- mov 20(vp), R32(%rbp)
- mov %r8, 16(rp)
- ADDSUB %rax, %r10
- jc L(c2)
-L(rc2):
-L(L11): mov 24(up), %r11
- shr $RSH, R32(%rbp)
- ADDSUB %r9, %r12
- mov 24(vp), %r9
- lea (%rbp,%r9,M), %r9
- lea 32(up), up C lea advances the pointers without touching the carry flag
- lea 32(vp), vp
- setc R8(%rax)
- mov -4(vp), R32(%rbp) C vp was already advanced, so this is offset 28 of the old vp
- mov %r10, 24(rp)
- ADDSUB %rbx, %r12
- jc L(c3)
-L(rc3): lea 32(rp), rp
-L(L10): sub $4, n
- ja L(top) C loop while more than 4 limbs remained before the sub
-
-L(end):
- shr $RSH, R32(%rbp) C ebp = bits shifted out of the last v limb
- ADDSUB %r9, %r11
- setc R8(%rbx)
- mov %r12, (rp)
- ADDSUB %rax, %r11
- jnc L(1)
- mov $1, R8(%rbx)
-L(1): mov %r11, 8(rp) C store the most significant result limb
- lea (%rbx,%rbp), R32(%rax) C eax = final carry + bits shifted out of the top v limb
- pop %rbp
- pop %r12
- pop %rbx
- FUNC_EXIT()
- ret
-L(c3): mov $1, R8(%rax)
- jmp L(rc3)
-EPILOGUE()
-ASM_END()
diff --git a/gmp/mpn/x86_64/pentium4/gmp-mparam.h b/gmp/mpn/x86_64/pentium4/gmp-mparam.h
index 2171e230a5..ca9239775b 100644
--- a/gmp/mpn/x86_64/pentium4/gmp-mparam.h
+++ b/gmp/mpn/x86_64/pentium4/gmp-mparam.h
@@ -1,231 +1,79 @@
/* Pentium 4-64 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2010, 2014 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
-or
-
- * the GNU General Public License as published by the Free Software
- Foundation; either version 2 of the License, or (at your option) any
- later version.
-
-or both in parallel, as here.
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
-You should have received copies of the GNU General Public License and the
-GNU Lesser General Public License along with the GNU MP Library. If not,
-see https://www.gnu.org/licenses/. */
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define GMP_LIMB_BITS 64
-#define GMP_LIMB_BYTES 8
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
 /* These routines exist for all x86_64 chips, but they are slower on Pentium4
than separate add/sub and shift. Make sure they are not really used. */
-#undef HAVE_NATIVE_mpn_rsblsh1_n
-#undef HAVE_NATIVE_mpn_rsblsh2_n
-#undef HAVE_NATIVE_mpn_addlsh_n
-#undef HAVE_NATIVE_mpn_rsblsh_n
-
-/* 3400 MHz Pentium4 Nocona / 1024 Kibyte cache */
-/* FFT tuning limit = 25000000 */
-/* Generated by tuneup.c, 2014-03-12, gcc 4.5 */
-
-#define MOD_1_NORM_THRESHOLD 0 /* always */
-#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
-#define USE_PREINV_DIVREM_1 1 /* native */
-#define DIV_QR_1_NORM_THRESHOLD 1
-#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
-#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 20
-
-#define MUL_TOOM22_THRESHOLD 12
-#define MUL_TOOM33_THRESHOLD 41
-#define MUL_TOOM44_THRESHOLD 112
-#define MUL_TOOM6H_THRESHOLD 157
-#define MUL_TOOM8H_THRESHOLD 236
-
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78
-#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
-
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 53
-#define SQR_TOOM4_THRESHOLD 154
-#define SQR_TOOM6_THRESHOLD 197
-#define SQR_TOOM8_THRESHOLD 296
-
-#define MULMID_TOOM42_THRESHOLD 22
-
-#define MULMOD_BNM1_THRESHOLD 9
-#define SQRMOD_BNM1_THRESHOLD 9
-
-#define MUL_FFT_MODF_THRESHOLD 252 /* k = 5 */
-#define MUL_FFT_TABLE3 \
- { { 252, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \
- { 17, 7}, { 9, 6}, { 19, 7}, { 13, 8}, \
- { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \
- { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
- { 21, 9}, { 11, 8}, { 25,10}, { 7, 9}, \
- { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 9}, { 39,10}, \
- { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
- { 95,10}, { 55,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \
- { 287,10}, { 79,11}, { 47,10}, { 95, 9}, \
- { 191,10}, { 103,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287,11}, \
- { 79,10}, { 159, 9}, { 319,10}, { 175, 9}, \
- { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \
- { 223,12}, { 63,11}, { 127,10}, { 255,11}, \
- { 143,10}, { 287, 9}, { 575, 8}, { 1151,11}, \
- { 159,10}, { 319,11}, { 175,10}, { 351,12}, \
- { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
- { 415,11}, { 223,13}, { 63,12}, { 127,11}, \
- { 255,10}, { 511,11}, { 287,10}, { 575, 9}, \
- { 1151,12}, { 159,11}, { 319,10}, { 639,11}, \
- { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
- { 767,11}, { 415,12}, { 223,11}, { 447,13}, \
- { 127,12}, { 255,11}, { 511,12}, { 287,11}, \
- { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \
- { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
- { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
- { 895,14}, { 127,13}, { 255,12}, { 511,11}, \
- { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \
- { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \
- { 1279,12}, { 703,11}, { 1407,10}, { 2815,13}, \
- { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \
- { 1663,13}, { 447,12}, { 895,14}, { 255,13}, \
- { 511,12}, { 1023,11}, { 2047,12}, { 1087,11}, \
- { 2175,13}, { 575,12}, { 1151,11}, { 2303,12}, \
- { 1215,11}, { 2431,10}, { 4863,13}, { 639,12}, \
- { 1279,11}, { 2559,13}, { 703,12}, { 1407,11}, \
- { 2815,14}, { 383,13}, { 767,12}, { 1535,13}, \
- { 831,12}, { 1663,13}, { 895,15}, { 255,14}, \
- { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
- { 2175,13}, { 1151,12}, { 2303,13}, { 1215,12}, \
- { 2431,11}, { 4863,14}, { 639,13}, { 1279,12}, \
- { 2559,13}, { 1407,12}, { 2815,14}, { 767,13}, \
- { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \
- { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
- { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
- { 2431,12}, { 4863,14}, { 1279,13}, { 2559,14}, \
- { 1407,13}, { 2815,15}, { 32768,16}, { 65536,17}, \
- { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
- {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 211
-#define MUL_FFT_THRESHOLD 2240
-
-#define SQR_FFT_MODF_THRESHOLD 212 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 212, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
- { 13, 7}, { 7, 6}, { 15, 7}, { 9, 6}, \
- { 19, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
- { 9, 7}, { 20, 8}, { 11, 7}, { 24, 8}, \
- { 13, 9}, { 7, 8}, { 21, 9}, { 11, 8}, \
- { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
- { 19, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
- { 15,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
- { 67,10}, { 39, 9}, { 79,10}, { 55,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
- { 71, 9}, { 143, 8}, { 287,10}, { 79, 9}, \
- { 159,11}, { 47, 9}, { 191,12}, { 31,11}, \
- { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
- { 287,11}, { 79,10}, { 159, 9}, { 319,10}, \
- { 175, 9}, { 351,10}, { 191, 9}, { 383,10}, \
- { 207,11}, { 111,10}, { 223,12}, { 63,11}, \
- { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319,11}, { 175,10}, \
- { 351,11}, { 191,10}, { 383,11}, { 223,13}, \
- { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
- { 287,10}, { 575,12}, { 159,11}, { 351,12}, \
- { 191,11}, { 383,12}, { 223,11}, { 447,13}, \
- { 127,12}, { 255,11}, { 511,12}, { 287,11}, \
- { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \
- { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
- { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
- { 255,12}, { 511,11}, { 1023,10}, { 2047,11}, \
- { 1087,12}, { 575,11}, { 1151,13}, { 319,12}, \
- { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
- { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \
- { 447,14}, { 255,13}, { 511,12}, { 1023,11}, \
- { 2047,13}, { 575,12}, { 1151,11}, { 2303,12}, \
- { 1215,13}, { 639,12}, { 1279,11}, { 2559,13}, \
- { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
- { 831,12}, { 1663,13}, { 895,15}, { 255,14}, \
- { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
- { 2175,13}, { 1151,12}, { 2303,13}, { 1215,12}, \
- { 2431,14}, { 639,13}, { 1279,12}, { 2687,13}, \
- { 1407,12}, { 2815,14}, { 767,13}, { 1663,14}, \
- { 895,13}, { 1791,12}, { 3583,15}, { 511,14}, \
- { 1023,13}, { 2175,14}, { 1151,13}, { 2303,12}, \
- { 4607,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
- { 2559,14}, { 1407,13}, { 2815,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 184
-#define SQR_FFT_THRESHOLD 1984
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 4392
-
-#define DC_DIV_QR_THRESHOLD 35
-#define DC_DIVAPPR_Q_THRESHOLD 68
-#define DC_BDIV_QR_THRESHOLD 32
-#define DC_BDIV_Q_THRESHOLD 56
-
-#define INV_MULMOD_BNM1_THRESHOLD 22
-#define INV_NEWTON_THRESHOLD 195
-#define INV_APPR_THRESHOLD 116
-
-#define BINV_NEWTON_THRESHOLD 199
-#define REDC_1_TO_REDC_2_THRESHOLD 4
-#define REDC_2_TO_REDC_N_THRESHOLD 42
-
-#define MU_DIV_QR_THRESHOLD 979
-#define MU_DIVAPPR_Q_THRESHOLD 979
-#define MUPI_DIV_QR_THRESHOLD 91
-#define MU_BDIV_QR_THRESHOLD 855
-#define MU_BDIV_Q_THRESHOLD 942
-
-#define POWM_SEC_TABLE 1,16,175,692,1603
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 109
-#define HGCD_APPR_THRESHOLD 119
-#define HGCD_REDUCE_THRESHOLD 1679
-#define GCD_DC_THRESHOLD 222
-#define GCDEXT_DC_THRESHOLD 238
-#define JACOBI_BASE_METHOD 4
-
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 537
-#define SET_STR_PRECOMPUTE_THRESHOLD 1430
-
-#define FAC_DSC_THRESHOLD 1127
-#define FAC_ODD_THRESHOLD 0 /* always */
+#undef HAVE_NATIVE_mpn_addlsh1_n
+#undef HAVE_NATIVE_mpn_sublsh1_n
+#undef HAVE_NATIVE_mpn_rsh1add_n
+#undef HAVE_NATIVE_mpn_rsh1sub_n
+
+/* 3200 MHz Pentium / 2048 Kibyte cache / socket 775 */
+
+/* Generated by tuneup.c, 2009-01-15, gcc 3.4 */
+
+#define MUL_KARATSUBA_THRESHOLD 12
+#define MUL_TOOM3_THRESHOLD 91
+#define MUL_TOOM44_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_KARATSUBA_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 218
+
+#define MULLOW_BASECASE_THRESHOLD 0 /* always */
+#define MULLOW_DC_THRESHOLD 28
+#define MULLOW_MUL_N_THRESHOLD 246
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 35
+#define POWM_THRESHOLD 59
+
+#define MATRIX22_STRASSEN_THRESHOLD 25
+#define HGCD_THRESHOLD 112
+#define GCD_DC_THRESHOLD 258
+#define GCDEXT_DC_THRESHOLD 311
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 5
+#define MOD_1_2_THRESHOLD 7
+#define MOD_1_4_THRESHOLD 28
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define USE_PREINV_MOD_1 1
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 866
+#define SET_STR_PRECOMPUTE_THRESHOLD 1646
+
+#define MUL_FFT_TABLE { 240, 416, 1216, 2304, 7168, 20480, 49152, 196608, 786432, 0 }
+#define MUL_FFT_MODF_THRESHOLD 256
+#define MUL_FFT_THRESHOLD 2944
+
+#define SQR_FFT_TABLE { 208, 480, 1600, 2304, 7168, 20480, 49152, 196608, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD 224
+#define SQR_FFT_THRESHOLD 2688
diff --git a/gmp/mpn/x86_64/pentium4/lshift.asm b/gmp/mpn/x86_64/pentium4/lshift.asm
index d3b521364f..7596d9c5c0 100644
--- a/gmp/mpn/x86_64/pentium4/lshift.asm
+++ b/gmp/mpn/x86_64/pentium4/lshift.asm
@@ -1,44 +1,31 @@
dnl x86-64 mpn_lshift optimized for Pentium 4.
-dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.5
-C AMD K10 ?
-C Intel P4 3.29
-C Intel core2 2.1 (fluctuates, presumably cache related)
-C Intel corei ?
-C Intel atom 14.3
-C VIA nano ?
+C K8,K9: 2.5
+C K10: ?
+C P4: 3.29
+C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
+C P6-28 (Atom): 14.3
C INPUT PARAMETERS
define(`rp',`%rdi')
@@ -46,28 +33,24 @@ define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_lshift)
- FUNC_ENTRY(4)
mov -8(up,n,8), %rax
- movd R32(%rcx), %mm4
- neg R32(%rcx) C put rsh count in cl
- and $63, R32(%rcx)
- movd R32(%rcx), %mm5
+ movd %ecx, %mm4
+ neg %ecx C put rsh count in cl
+ and $63, %ecx
+ movd %ecx, %mm5
- lea 1(n), R32(%r8)
+ lea 1(n), %r8d
- shr R8(%rcx), %rax C function return value
+ shr %cl, %rax C function return value
- and $3, R32(%r8)
+ and $3, %r8d
je L(rol) C jump for n = 3, 7, 11, ...
- dec R32(%r8)
+ dec %r8d
jne L(1)
C n = 4, 8, 12, ...
movq -8(up,n,8), %mm2
@@ -79,7 +62,7 @@ C n = 4, 8, 12, ...
dec n
jmp L(rol)
-L(1): dec R32(%r8)
+L(1): dec %r8d
je L(1x) C jump for n = 1, 5, 9, 13, ...
 C n = 2, 6, 10, 14, ...
movq -8(up,n,8), %mm2
@@ -148,19 +131,18 @@ L(top):
jae L(top) C 2
L(end):
- movq 8(up), %mm0
+ movq 16(up,n,8), %mm0
psrlq %mm5, %mm0
por %mm0, %mm2
- movq (up), %mm1
+ movq 8(up,n,8), %mm1
psrlq %mm5, %mm1
por %mm1, %mm3
- movq %mm2, 16(rp)
- movq %mm3, 8(rp)
+ movq %mm2, 24(rp,n,8)
+ movq %mm3, 16(rp,n,8)
L(ast): movq (up), %mm2
psllq %mm4, %mm2
movq %mm2, (rp)
emms
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/pentium4/lshiftc.asm b/gmp/mpn/x86_64/pentium4/lshiftc.asm
deleted file mode 100644
index fc64676574..0000000000
--- a/gmp/mpn/x86_64/pentium4/lshiftc.asm
+++ /dev/null
@@ -1,179 +0,0 @@
-dnl x86-64 mpn_lshiftc optimized for Pentium 4.
-
-dnl Copyright 2003, 2005, 2007, 2008, 2010, 2012 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 ?
-C AMD K10 ?
-C Intel P4 4.15
-C Intel core2 ?
-C Intel corei ?
-C Intel atom ?
-C VIA nano ?
-
-C INPUT PARAMETERS
-define(`rp',`%rdi')
-define(`up',`%rsi')
-define(`n',`%rdx')
-define(`cnt',`%cl')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_lshiftc)
- FUNC_ENTRY(4)
- mov -8(up,n,8), %rax C rax = up[n-1], the most significant limb
- pcmpeqd %mm6, %mm6 C 0xffff...fff
- movd R32(%rcx), %mm4 C mm4 = left shift count
- neg R32(%rcx) C put rsh count in cl
- and $63, R32(%rcx)
- movd R32(%rcx), %mm5 C mm5 = right shift count, (-cnt) mod 64
-
- lea 1(n), R32(%r8) C r8d = n+1, its low two bits select the feed-in path
-
- shr R8(%rcx), %rax C function return value
-
- and $3, R32(%r8)
- je L(rol) C jump for n = 3, 7, 11, ...
-
- dec R32(%r8)
- jne L(1)
-C n = 4, 8, 12, ...
- movq -8(up,n,8), %mm2
- psllq %mm4, %mm2
- movq -16(up,n,8), %mm0
- pxor %mm6, %mm2 C complement the lsh part
- psrlq %mm5, %mm0
- pandn %mm2, %mm0 C mm0 = ~(rsh part) & ~(lsh part), the complemented shifted limb
- movq %mm0, -8(rp,n,8)
- dec n
- jmp L(rol)
-
-L(1): dec R32(%r8)
- je L(1x) C jump for n = 1, 5, 9, 13, ...
-C n = 2, 6, 10, 14, ...
- movq -8(up,n,8), %mm2
- psllq %mm4, %mm2
- movq -16(up,n,8), %mm0
- pxor %mm6, %mm2
- psrlq %mm5, %mm0
- pandn %mm2, %mm0
- movq %mm0, -8(rp,n,8)
- dec n
-L(1x):
- cmp $1, n
- je L(ast) C only the lowest limb is left
- movq -8(up,n,8), %mm2
- psllq %mm4, %mm2
- movq -16(up,n,8), %mm3
- psllq %mm4, %mm3
- movq -16(up,n,8), %mm0
- movq -24(up,n,8), %mm1
- pxor %mm6, %mm2
- psrlq %mm5, %mm0
- pandn %mm2, %mm0
- pxor %mm6, %mm3
- psrlq %mm5, %mm1
- pandn %mm3, %mm1
- movq %mm0, -8(rp,n,8)
- movq %mm1, -16(rp,n,8)
- sub $2, n
-
-L(rol): movq -8(up,n,8), %mm2 C start the lsh halves of the next two result limbs
- psllq %mm4, %mm2
- movq -16(up,n,8), %mm3
- psllq %mm4, %mm3
-
- sub $4, n
- jb L(end)
- ALIGN(32)
-L(top):
- C finish stuff from lsh block
- movq 16(up,n,8), %mm0
- pxor %mm6, %mm2
- movq 8(up,n,8), %mm1
- psrlq %mm5, %mm0
- psrlq %mm5, %mm1
- pandn %mm2, %mm0
- pxor %mm6, %mm3
- movq %mm0, 24(rp,n,8)
- movq (up,n,8), %mm0
- pandn %mm3, %mm1
- movq %mm1, 16(rp,n,8)
- movq -8(up,n,8), %mm1
- C start two new rsh
- psrlq %mm5, %mm0
- psrlq %mm5, %mm1
-
- C finish stuff from rsh block
- movq 8(up,n,8), %mm2
- pxor %mm6, %mm0 C here the rsh part is complemented and the lsh part is the pandn destination
- movq (up,n,8), %mm3
- psllq %mm4, %mm2
- psllq %mm4, %mm3
- pandn %mm0, %mm2
- pxor %mm6, %mm1
- movq %mm2, 8(rp,n,8)
- movq -8(up,n,8), %mm2
- pandn %mm1, %mm3
- movq %mm3, (rp,n,8)
- movq -16(up,n,8), %mm3
- C start two new lsh
- sub $4, n
- psllq %mm4, %mm2
- psllq %mm4, %mm3
-
- jae L(top)
-
-L(end): pxor %mm6, %mm2 C wind-down: finish the two pending limbs, stored to rp[2] and rp[1]
- movq 8(up), %mm0
- psrlq %mm5, %mm0
- pandn %mm2, %mm0
- pxor %mm6, %mm3
- movq (up), %mm1
- psrlq %mm5, %mm1
- pandn %mm3, %mm1
- movq %mm0, 16(rp)
- movq %mm1, 8(rp)
-
-L(ast): movq (up), %mm2
- psllq %mm4, %mm2
- pxor %mm6, %mm2 C lowest limb is the complement of up[0] << cnt
- movq %mm2, (rp)
- emms
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/pentium4/mod_34lsub1.asm b/gmp/mpn/x86_64/pentium4/mod_34lsub1.asm
deleted file mode 100644
index f34b3f079a..0000000000
--- a/gmp/mpn/x86_64/pentium4/mod_34lsub1.asm
+++ /dev/null
@@ -1,167 +0,0 @@
-dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
-
-dnl Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
-dnl Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 1.0
-C AMD K10 1.12
-C Intel P4 3.25
-C Intel core2 1.5
-C Intel corei 1.5
-C Intel atom 2.5
-C VIA nano 1.75
-
-
-C INPUT PARAMETERS
-define(`ap', %rdi)
-define(`n', %rsi)
-
-C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
-
-C TODO
-C * Review feed-in and wind-down code. In particular, try to avoid adc and
-C sbb to placate Pentium4.
-C * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
-C without the dual loop exits.
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_mod_34lsub1)
- FUNC_ENTRY(2)
-
- mov $0x0000FFFFFFFFFFFF, %r11 C r11 = 2^48-1, mask for the low 48 bits
-
- sub $2, %rsi C rsi = n-2, the flags also feed the jb below
- ja L(gt2)
-
- mov (ap), %rax C rax = src[0]
- nop
- jb L(1) C n = 1: return src[0] unchanged
-
- mov 8(ap), %rsi
- mov %rax, %rdx
- shr $48, %rax C src[0] high
-
- and %r11, %rdx C src[0] low
- add %rdx, %rax
- mov R32(%rsi), R32(%rdx)
-
- shr $32, %rsi C src[1] high
- add %rsi, %rax
-
- shl $16, %rdx C src[1] low
- add %rdx, %rax
-
-L(1): FUNC_EXIT()
- ret
-
-
- ALIGN(16)
-L(gt2): xor R32(%rax), R32(%rax) C rax, rcx, rdx accumulate limbs at index 0, 1, 2 mod 3
- xor R32(%rcx), R32(%rcx)
- xor R32(%rdx), R32(%rdx)
- xor %r8, %r8 C r10, r8, r9 count the carries out of rax, rcx, rdx
- xor %r9, %r9
- xor %r10, %r10
-
-L(top): add (ap), %rax
- adc $0, %r10
- add 8(ap), %rcx
- adc $0, %r8
- add 16(ap), %rdx
- adc $0, %r9
-
- sub $3, %rsi
- jng L(end)
-
- add 24(ap), %rax
- adc $0, %r10
- add 32(ap), %rcx
- adc $0, %r8
- add 40(ap), %rdx
- lea 48(ap), ap
- adc $0, %r9
-
- sub $3, %rsi
- jg L(top)
-
-
- add $-24, ap C undo half of the 48-byte step so 24(ap) is the next unread limb
-L(end): add %r9, %rax C fold each carry count into the next accumulator
- adc %r10, %rcx
- adc %r8, %rdx
-
- inc %rsi
- mov $0x1, R32(%r10) C r10 = weight of a carry out of the last adc
- js L(combine)
-
- mov $0x10000, R32(%r10)
- adc 24(ap), %rax
- dec %rsi
- js L(combine)
-
- adc 32(ap), %rcx
- mov $0x100000000, %r10
-
-L(combine):
- sbb %rsi, %rsi C carry
- mov %rax, %rdi C 0mod3
- shr $48, %rax C 0mod3 high
-
- and %r10, %rsi C carry masked
- and %r11, %rdi C 0mod3 low
- mov R32(%rcx), R32(%r10) C 1mod3
-
- add %rsi, %rax C apply carry
- shr $32, %rcx C 1mod3 high
-
- add %rdi, %rax C apply 0mod3 low
- movzwl %dx, R32(%rdi) C 2mod3
- shl $16, %r10 C 1mod3 low
-
- add %rcx, %rax C apply 1mod3 high
- shr $16, %rdx C 2mod3 high
-
- add %r10, %rax C apply 1mod3 low
- shl $32, %rdi C 2mod3 low
-
- add %rdx, %rax C apply 2mod3 high
- add %rdi, %rax C apply 2mod3 low
-
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/pentium4/popcount.asm b/gmp/mpn/x86_64/pentium4/popcount.asm
index 7014b39de5..b1a866bf5e 100644
--- a/gmp/mpn/x86_64/pentium4/popcount.asm
+++ b/gmp/mpn/x86_64/pentium4/popcount.asm
@@ -3,33 +3,21 @@ dnl x86-64 mpn_popcount optimized for Pentium 4.
dnl Copyright 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-MULFUNC_PROLOGUE(mpn_popcount)
include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp/mpn/x86_64/pentium4/rsh1aors_n.asm b/gmp/mpn/x86_64/pentium4/rsh1aors_n.asm
deleted file mode 100644
index 5528ce47da..0000000000
--- a/gmp/mpn/x86_64/pentium4/rsh1aors_n.asm
+++ /dev/null
@@ -1,334 +0,0 @@
-dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.
-
-dnl Contributed to the GNU project by Torbjorn Granlund.
-
-dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C AMD K8,K9 4.13
-C AMD K10 4.13
-C Intel P4 5.70
-C Intel core2 4.75
-C Intel corei 5
-C Intel atom 8.75
-C VIA nano 5.25
-
-C TODO
-C * Try to make this smaller, 746 bytes seem excessive for this 2nd class
-C function. Less sw pipelining would help, and since we now probably
-C pipeline somewhat too deeply, it might not affect performance too much.
-C * A separate small-n loop might speed things as well as make things smaller.
-C That loop should be selected before pushing registers.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cy', `%r8')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(ADDSUB, add)
- define(func, mpn_rsh1add_n)
- define(func_nc, mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(ADDSUB, sub)
- define(func, mpn_rsh1sub_n)
- define(func_nc, mpn_rsh1sub_nc)')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
-
-ASM_START()
- TEXT
-PROLOGUE(func)
- FUNC_ENTRY(4)
- xor %r8, %r8
-IFDOS(` jmp L(ent) ')
-EPILOGUE()
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
-L(ent): push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov (vp), %r9
- mov (up), %r15
-
- mov R32(n), R32(%rax)
- and $3, R32(%rax)
- jne L(n00)
-
- mov R32(%r8), R32(%rbx) C n = 0, 4, 8, ...
- mov 8(up), %r10
- ADDSUB %r9, %r15
- mov 8(vp), %r9
- setc R8(%rax)
- ADDSUB %rbx, %r15 C return bit
- jnc 1f
- mov $1, R8(%rax)
-1: mov 16(up), %r12
- ADDSUB %r9, %r10
- mov 16(vp), %r9
- setc R8(%rbx)
- mov %r15, %r13
- ADDSUB %rax, %r10
- jnc 1f
- mov $1, R8(%rbx)
-1: mov 24(up), %r11
- ADDSUB %r9, %r12
- lea 32(up), up
- mov 24(vp), %r9
- lea 32(vp), vp
- setc R8(%rax)
- mov %r10, %r14
- shl $63, %r10
- shr %r13
- jmp L(L00)
-
-L(n00): cmp $2, R32(%rax)
- jnc L(n01)
- xor R32(%rbx), R32(%rbx) C n = 1, 5, 9, ...
- lea -24(rp), rp
- mov R32(%r8), R32(%rax)
- dec n
- jnz L(gt1)
- ADDSUB %r9, %r15
- setc R8(%rbx)
- ADDSUB %rax, %r15
- jnc 1f
- mov $1, R8(%rbx)
-1: mov %r15, %r14
- shl $63, %rbx
- shr %r14
- jmp L(cj1)
-L(gt1): mov 8(up), %r8
- ADDSUB %r9, %r15
- mov 8(vp), %r9
- setc R8(%rbx)
- ADDSUB %rax, %r15
- jnc 1f
- mov $1, R8(%rbx)
-1: mov 16(up), %r10
- ADDSUB %r9, %r8
- mov 16(vp), %r9
- setc R8(%rax)
- mov %r15, %r14
- ADDSUB %rbx, %r8
- jnc 1f
- mov $1, R8(%rax)
-1: mov 24(up), %r12
- ADDSUB %r9, %r10
- mov 24(vp), %r9
- setc R8(%rbx)
- mov %r8, %r13
- shl $63, %r8
- shr %r14
- lea 8(up), up
- lea 8(vp), vp
- jmp L(L01)
-
-L(n01): jne L(n10)
- lea -16(rp), rp C n = 2, 6, 10, ...
- mov R32(%r8), R32(%rbx)
- mov 8(up), %r11
- ADDSUB %r9, %r15
- mov 8(vp), %r9
- setc R8(%rax)
- ADDSUB %rbx, %r15
- jnc 1f
- mov $1, R8(%rax)
-1: sub $2, n
- jnz L(gt2)
- ADDSUB %r9, %r11
- setc R8(%rbx)
- mov %r15, %r13
- ADDSUB %rax, %r11
- jnc 1f
- mov $1, R8(%rbx)
-1: mov %r11, %r14
- shl $63, %r11
- shr %r13
- jmp L(cj2)
-L(gt2): mov 16(up), %r8
- ADDSUB %r9, %r11
- mov 16(vp), %r9
- setc R8(%rbx)
- mov %r15, %r13
- ADDSUB %rax, %r11
- jnc 1f
- mov $1, R8(%rbx)
-1: mov 24(up), %r10
- ADDSUB %r9, %r8
- mov 24(vp), %r9
- setc R8(%rax)
- mov %r11, %r14
- shl $63, %r11
- shr %r13
- lea 16(up), up
- lea 16(vp), vp
- jmp L(L10)
-
-L(n10): xor R32(%rbx), R32(%rbx) C n = 3, 7, 11, ...
- lea -8(rp), rp
- mov R32(%r8), R32(%rax)
- mov 8(up), %r12
- ADDSUB %r9, %r15
- mov 8(vp), %r9
- setc R8(%rbx)
- ADDSUB %rax, %r15
- jnc 1f
- mov $1, R8(%rbx)
-1: mov 16(up), %r11
- ADDSUB %r9, %r12
- mov 16(vp), %r9
- setc R8(%rax)
- mov %r15, %r14
- ADDSUB %rbx, %r12
- jnc 1f
- mov $1, R8(%rax)
-1: sub $3, n
- jnz L(gt3)
- ADDSUB %r9, %r11
- setc R8(%rbx)
- mov %r12, %r13
- shl $63, %r12
- shr %r14
- jmp L(cj3)
-L(gt3): mov 24(up), %r8
- ADDSUB %r9, %r11
- mov 24(vp), %r9
- setc R8(%rbx)
- mov %r12, %r13
- shl $63, %r12
- shr %r14
- lea 24(up), up
- lea 24(vp), vp
- jmp L(L11)
-
-L(c0): mov $1, R8(%rbx)
- jmp L(rc0)
-L(c1): mov $1, R8(%rax)
- jmp L(rc1)
-L(c2): mov $1, R8(%rbx)
- jmp L(rc2)
-
- ALIGN(16)
-L(top): mov (up), %r8 C not on critical path
- or %r13, %r10
- ADDSUB %r9, %r11 C not on critical path
- mov (vp), %r9 C not on critical path
- setc R8(%rbx) C save carry out
- mov %r12, %r13 C new for later
- shl $63, %r12 C shift new right
- shr %r14 C shift old left
- mov %r10, (rp)
-L(L11): ADDSUB %rax, %r11 C apply previous carry out
- jc L(c0) C jump if ripple
-L(rc0): mov 8(up), %r10
- or %r14, %r12
- ADDSUB %r9, %r8
- mov 8(vp), %r9
- setc R8(%rax)
- mov %r11, %r14
- shl $63, %r11
- shr %r13
- mov %r12, 8(rp)
-L(L10): ADDSUB %rbx, %r8
- jc L(c1)
-L(rc1): mov 16(up), %r12
- or %r13, %r11
- ADDSUB %r9, %r10
- mov 16(vp), %r9
- setc R8(%rbx)
- mov %r8, %r13
- shl $63, %r8
- shr %r14
- mov %r11, 16(rp)
-L(L01): ADDSUB %rax, %r10
- jc L(c2)
-L(rc2): mov 24(up), %r11
- or %r14, %r8
- ADDSUB %r9, %r12
- lea 32(up), up
- mov 24(vp), %r9
- lea 32(vp), vp
- setc R8(%rax)
- mov %r10, %r14
- shl $63, %r10
- shr %r13
- mov %r8, 24(rp)
- lea 32(rp), rp
-L(L00): ADDSUB %rbx, %r12
- jc L(c3)
-L(rc3): sub $4, n
- ja L(top)
-
-L(end): or %r13, %r10
- ADDSUB %r9, %r11
- setc R8(%rbx)
- mov %r12, %r13
- shl $63, %r12
- shr %r14
- mov %r10, (rp)
-L(cj3): ADDSUB %rax, %r11
- jnc 1f
- mov $1, R8(%rbx)
-1: or %r14, %r12
- mov %r11, %r14
- shl $63, %r11
- shr %r13
- mov %r12, 8(rp)
-L(cj2): or %r13, %r11
- shl $63, %rbx
- shr %r14
- mov %r11, 16(rp)
-L(cj1): or %r14, %rbx
- mov %rbx, 24(rp)
-
- mov R32(%r15), R32(%rax)
- and $1, R32(%rax)
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
- FUNC_EXIT()
- ret
-L(c3): mov $1, R8(%rax)
- jmp L(rc3)
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/pentium4/rshift.asm b/gmp/mpn/x86_64/pentium4/rshift.asm
index b7c1ee2cdd..61899c5ecf 100644
--- a/gmp/mpn/x86_64/pentium4/rshift.asm
+++ b/gmp/mpn/x86_64/pentium4/rshift.asm
@@ -1,44 +1,31 @@
dnl x86-64 mpn_rshift optimized for Pentium 4.
-dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.5
-C AMD K10 ?
-C Intel P4 3.29
-C Intel core2 2.1 (fluctuates, presumably cache related)
-C Intel corei ?
-C Intel atom 14.3
-C VIA nano ?
+C K8,K9: 2.5
+C K10: ?
+C P4: 3.29
+C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
+C P6-28 (Atom): 14.3
C INPUT PARAMETERS
define(`rp',`%rdi')
@@ -46,31 +33,27 @@ define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_rshift)
- FUNC_ENTRY(4)
mov (up), %rax
- movd R32(%rcx), %mm4
- neg R32(%rcx) C put lsh count in cl
- and $63, R32(%rcx)
- movd R32(%rcx), %mm5
+ movd %ecx, %mm4
+ neg %ecx C put lsh count in cl
+ and $63, %ecx
+ movd %ecx, %mm5
lea -8(up,n,8), up
lea -8(rp,n,8), rp
- lea 1(n), R32(%r8)
+ lea 1(n), %r8d
neg n
- shl R8(%rcx), %rax C function return value
+ shl %cl, %rax C function return value
- and $3, R32(%r8)
+ and $3, %r8d
je L(rol) C jump for n = 3, 7, 11, ...
- dec R32(%r8)
+ dec %r8d
jne L(1)
C n = 4, 8, 12, ...
movq 8(up,n,8), %mm2
@@ -82,7 +65,7 @@ C n = 4, 8, 12, ...
inc n
jmp L(rol)
-L(1): dec R32(%r8)
+L(1): dec %r8d
je L(1x) C jump for n = 1, 5, 9, 13, ...
C n = 2, 6, 10, 16, ...
movq 8(up,n,8), %mm2
@@ -151,19 +134,18 @@ L(top):
jae L(top) C 2
L(end):
- movq -8(up), %mm0
+ movq -16(up,n,8), %mm0
psllq %mm5, %mm0
por %mm0, %mm2
- movq (up), %mm1
+ movq -8(up,n,8), %mm1
psllq %mm5, %mm1
por %mm1, %mm3
- movq %mm2, -16(rp)
- movq %mm3, -8(rp)
+ movq %mm2, -24(rp,n,8)
+ movq %mm3, -16(rp,n,8)
L(ast): movq (up), %mm2
psrlq %mm4, %mm2
movq %mm2, (rp)
emms
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/pentium4/sec_tabselect.asm b/gmp/mpn/x86_64/pentium4/sec_tabselect.asm
deleted file mode 100644
index e4360341d9..0000000000
--- a/gmp/mpn/x86_64/pentium4/sec_tabselect.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl X86-64 mpn_sec_tabselect.
-
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_sec_tabselect)
-include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp/mpn/x86_64/popham.asm b/gmp/mpn/x86_64/popham.asm
index 9005f81776..e2bdb1a0b8 100644
--- a/gmp/mpn/x86_64/popham.asm
+++ b/gmp/mpn/x86_64/popham.asm
@@ -1,32 +1,21 @@
dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
-dnl Copyright 2004, 2005, 2007, 2010-2012 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -34,13 +23,10 @@ include(`../config.m4')
C popcount hamdist
C cycles/limb cycles/limb
-C AMD K8,K9 6 7
-C AMD K10 6 7
-C Intel P4 12 14.3
-C Intel core2 7 8
-C Intel corei ? 7.3
-C Intel atom 16.5 17.5
-C VIA nano 8.75 10.4
+C K8,K9: 6 7
+C K10: 6 7
+C P4: 12 14.3
+C P6-15: 7 8
C TODO
C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for
@@ -55,7 +41,6 @@ ifdef(`OPERATION_popcount',`
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%rdx')
- define(`POP', `$1')
define(`HAM', `dnl')
')
ifdef(`OPERATION_hamdist',`
@@ -67,111 +52,106 @@ ifdef(`OPERATION_hamdist',`
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%r14')
- define(`POP', `dnl')
define(`HAM', `$1')
')
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(func)
- POP(` FUNC_ENTRY(2) ')
- HAM(` FUNC_ENTRY(3) ')
- push %r12
- push %r13
- HAM(` push %r14 ')
- mov $0x5555555555555555, h55555555
- mov $0x3333333333333333, h33333333
- mov $0x0f0f0f0f0f0f0f0f, h0f0f0f0f
- mov $0x0101010101010101, h01010101
+ pushq %r12
+ pushq %r13
+ HAM(` pushq %r14 ')
+
+ movq $0x5555555555555555, h55555555
+ movq $0x3333333333333333, h33333333
+ movq $0x0f0f0f0f0f0f0f0f, h0f0f0f0f
+ movq $0x0101010101010101, h01010101
- lea (up,n,8), up
- HAM(` lea (vp,n,8), vp ')
- neg n
+ leaq (up,n,8), up
+ HAM(` leaq (vp,n,8), vp ')
+ negq n
- xor R32(%rax), R32(%rax)
+ xorl %eax, %eax
- bt $0, R32(n)
- jnc L(top)
+ btq $0, n
+ jnc L(oop)
- mov (up,n,8), %r8
- HAM(` xor (vp,n,8), %r8 ')
+ movq (up,n,8), %r8
+ HAM(` xorq (vp,n,8), %r8 ')
- mov %r8, %r9
- shr %r8
- and h55555555, %r8
- sub %r8, %r9
+ movq %r8, %r9
+ shrq %r8
+ andq h55555555, %r8
+ subq %r8, %r9
- mov %r9, %r8
- shr $2, %r9
- and h33333333, %r8
- and h33333333, %r9
- add %r8, %r9 C 16 4-bit fields (0..4)
+ movq %r9, %r8
+ shrq $2, %r9
+ andq h33333333, %r8
+ andq h33333333, %r9
+ addq %r8, %r9 C 16 4-bit fields (0..4)
- mov %r9, %r8
- shr $4, %r9
- and h0f0f0f0f, %r8
- and h0f0f0f0f, %r9
- add %r8, %r9 C 8 8-bit fields (0..16)
+ movq %r9, %r8
+ shrq $4, %r9
+ andq h0f0f0f0f, %r8
+ andq h0f0f0f0f, %r9
+ addq %r8, %r9 C 8 8-bit fields (0..16)
- imul h01010101, %r9 C sum the 8 fields in high 8 bits
- shr $56, %r9
+ imulq h01010101, %r9 C sum the 8 fields in high 8 bits
+ shrq $56, %r9
- mov %r9, %rax C add to total
- add $1, n
- jz L(end)
+ addq %r9, %rax C add to total
+ addq $1, n
+ jz L(done)
ALIGN(16)
-L(top): mov (up,n,8), %r8
- mov 8(up,n,8), %r12
- HAM(` xor (vp,n,8), %r8 ')
- HAM(` xor 8(vp,n,8), %r12 ')
-
- mov %r8, %r9
- mov %r12, %r13
- shr %r8
- shr %r12
- and h55555555, %r8
- and h55555555, %r12
- sub %r8, %r9
- sub %r12, %r13
-
- mov %r9, %r8
- mov %r13, %r12
- shr $2, %r9
- shr $2, %r13
- and h33333333, %r8
- and h33333333, %r9
- and h33333333, %r12
- and h33333333, %r13
- add %r8, %r9 C 16 4-bit fields (0..4)
- add %r12, %r13 C 16 4-bit fields (0..4)
-
- add %r13, %r9 C 16 4-bit fields (0..8)
- mov %r9, %r8
- shr $4, %r9
- and h0f0f0f0f, %r8
- and h0f0f0f0f, %r9
- add %r8, %r9 C 8 8-bit fields (0..16)
-
- imul h01010101, %r9 C sum the 8 fields in high 8 bits
- shr $56, %r9
-
- add %r9, %rax C add to total
- add $2, n
- jnc L(top)
-
-L(end):
- HAM(` pop %r14 ')
- pop %r13
- pop %r12
- FUNC_EXIT()
+L(oop): movq (up,n,8), %r8
+ movq 8(up,n,8), %r12
+ HAM(` xorq (vp,n,8), %r8 ')
+ HAM(` xorq 8(vp,n,8), %r12 ')
+
+ movq %r8, %r9
+ movq %r12, %r13
+ shrq %r8
+ shrq %r12
+ andq h55555555, %r8
+ andq h55555555, %r12
+ subq %r8, %r9
+ subq %r12, %r13
+
+ movq %r9, %r8
+ movq %r13, %r12
+ shrq $2, %r9
+ shrq $2, %r13
+ andq h33333333, %r8
+ andq h33333333, %r9
+ andq h33333333, %r12
+ andq h33333333, %r13
+ addq %r8, %r9 C 16 4-bit fields (0..4)
+ addq %r12, %r13 C 16 4-bit fields (0..4)
+
+ addq %r13, %r9 C 16 4-bit fields (0..8)
+ movq %r9, %r8
+ shrq $4, %r9
+ andq h0f0f0f0f, %r8
+ andq h0f0f0f0f, %r9
+ addq %r8, %r9 C 8 8-bit fields (0..16)
+
+ imulq h01010101, %r9 C sum the 8 fields in high 8 bits
+ shrq $56, %r9
+
+ addq %r9, %rax C add to total
+ addq $2, n
+ jnc L(oop)
+
+L(done):
+ HAM(` popq %r14 ')
+ popq %r13
+ popq %r12
ret
+
EPILOGUE()
diff --git a/gmp/mpn/x86_64/redc_1.asm b/gmp/mpn/x86_64/redc_1.asm
new file mode 100644
index 0000000000..23ccceed67
--- /dev/null
+++ b/gmp/mpn/x86_64/redc_1.asm
@@ -0,0 +1,335 @@
+dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.
+
+dnl Copyright 2004, 2008 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C cycles/limb
+C K8,K9: 2.5
+C K10: 2.5
+C P4: ?
+C P6-15 (Core2): 5.3
+C P6-28 (Atom): ?
+
+C TODO
+C * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
+C The code for 1, 2, 3, 4 should perhaps be completely register based.
+C * Perhaps align outer loops.
+C * The sub_n at the end leaks side-channel data. How do we fix that?
+C * Write mpn_addsub_n computing R = A + B - C. It should run at 2 c/l.
+C * We could software pipeline the IMUL stuff, by putting it before the
+C outer loops and before the end of the outer loops. The last outer
+C loop iteration would then compute an unneeded product, but it is at
+C least not a stray read from up[], since it is at up[n].
+C * Can we combine both the add_n and sub_n into the loops, somehow?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`param_mp',`%rdx')
+define(`n', `%rcx')
+define(`invm', `%r8')
+
+define(`mp', `%r13')
+define(`i', `%r11')
+define(`nneg', `%r12')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ push %rbp C save the callee-saved registers used below
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push n C keep entry n on the stack; reloaded for the calls at L(common)
+ sub $8, %rsp C maintain ABI required rsp alignment
+
+ lea (param_mp,n,8), mp C mp += n
+ lea (up,n,8), up C up += n
+
+ mov n, nneg
+ neg nneg C nneg = -n, negative index base for the inner loops
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax) C dispatch on n mod 4
+ jz L(b0) C n mod 4 = 0
+ cmp $2, R32(%rax)
+ jz L(b2) C n mod 4 = 2
+ jg L(b3) C n mod 4 = 3; fall through for n mod 4 = 1
+
+L(b1): C lea (mp), mp
+ lea -16(up), up
+L(o1): mov nneg, i
+ mov 16(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp C rbp = low limb of up[0] * invm
+
+ mov (mp,i,8), %rax
+ xor %ebx, %ebx
+ mul %rbp
+ add $1, i
+ jnz 1f C i = 1 - (entry n); falls through only when entry n = 1
+ add %rax, 8(up,i,8)
+ adc $0, %rdx
+ mov %rdx, %r14
+ jmp L(n1)
+
+1: mov %rax, %r9
+ mov (mp,i,8), %rax
+ mov %rdx, %r14
+ jmp L(mi1)
+
+ ALIGN(16)
+L(lo1): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+L(mi1): xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+ xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo1) C loop while the index is still negative
+L(ed1): add %r10, (up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 8(up)
+ adc $0, %r14
+L(n1): mov %r14, 16(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o1) C n counts the remaining outer passes
+C lea (mp), mp
+ lea 16(up), up
+ jmp L(common)
+
+L(b0): C lea (mp), mp
+ lea -16(up), up
+L(o0): mov nneg, i
+ mov 16(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp C rbp = low limb of up[0] * invm
+
+ mov (mp,i,8), %rax
+ xor %r10d, %r10d
+ mul %rbp
+ mov %rax, %r14
+ mov %rdx, %rbx
+ jmp L(mi0)
+
+ ALIGN(16)
+L(lo0): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+L(mi0): mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+ xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo0) C loop while the index is still negative
+L(ed0): add %r10, (up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 8(up)
+ adc $0, %r14
+ mov %r14, 16(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o0) C n counts the remaining outer passes
+C lea (mp), mp
+ lea 16(up), up
+ jmp L(common)
+
+
+L(b3): lea -8(mp), mp
+ lea -24(up), up
+L(o3): mov nneg, i
+ mov 24(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp C rbp = low limb of up[0] * invm
+
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ mov %rax, %rbx
+ mov %rdx, %r10
+ jmp L(mi3)
+
+ ALIGN(16)
+L(lo3): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+L(mi3): mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+ xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo3) C loop while the index is still negative
+L(ed3): add %r10, 8(up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 16(up)
+ adc $0, %r14
+ mov %r14, 24(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o3) C n counts the remaining outer passes
+ lea 8(mp), mp C undo the -8 bias applied at L(b3)
+ lea 24(up), up
+ jmp L(common)
+
+L(b2): lea -16(mp), mp
+ lea -32(up), up
+L(o2): mov nneg, i
+ mov 32(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp C rbp = low limb of up[0] * invm
+
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r14d, %r14d
+ mov %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %rdx, %r9
+ jmp L(mi2)
+
+ ALIGN(16)
+L(lo2): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+L(mi2): xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo2) C loop while the index is still negative
+L(ed2): add %r10, 16(up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 24(up)
+ adc $0, %r14
+ mov %r14, 32(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o2) C n counts the remaining outer passes
+ lea 16(mp), mp C undo the -16 bias applied at L(b2)
+ lea 32(up), up
+
+
+L(common):
+ lea (mp,nneg,8), mp C restore entry mp
+
+C cy = mpn_add_n (rp, up, up - n, n);
+C rdi rsi rdx rcx
+ lea (up,nneg,8), up C up -= n
+ lea (up,nneg,8), %rdx C rdx = up - n [up entry value]
+ mov rp, nneg C preserve rp over first call
+ mov 8(%rsp), %rcx C pass entry n
+C mov rp, %rdi
+ CALL( mpn_add_n)
+ test R32(%rax), R32(%rax)
+ jz L(ret) C no carry out of the addition: skip the subtraction
+
+C mpn_sub_n (rp, rp, mp, n);
+C rdi rsi rdx rcx
+ mov nneg, %rdi C nneg holds the rp value saved before the first call
+ mov nneg, %rsi
+ mov mp, %rdx
+ mov 8(%rsp), %rcx C pass entry n
+ CALL( mpn_sub_n)
+
+L(ret):
+ add $8, %rsp C drop the alignment padding
+ pop n C just increment rsp
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86_64/rsh1add_n.asm b/gmp/mpn/x86_64/rsh1add_n.asm
new file mode 100644
index 0000000000..0dd46f2c48
--- /dev/null
+++ b/gmp/mpn/x86_64/rsh1add_n.asm
@@ -0,0 +1,146 @@
+dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C K8,K9: 2.14 (mpn_add_n + mpn_rshift need 4.125)
+C K10: 2.14 (mpn_add_n + mpn_rshift need 4.125)
+C P4: 12.75
+C P6-15: 3.75
+
+C TODO
+C * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
+C * Try to approach the cache bandwidth 1.5 c/l. It should be possible.
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+define(`n32',`%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+ .byte 0,0,0,0,0,0,0,0
+PROLOGUE(mpn_rsh1add_n)
+ pushq %rbx C 1
+
+ xorl %eax, %eax C eax = 0; receives the shifted-out low bit below
+ movq (up), %rbx C rbx = up[0]
+ addq (vp), %rbx C rbx += vp[0], carry out left in CF
+
+ rcrq %rbx C rotate, save acy
+ adcl %eax, %eax C return value
+
+ movl n32, %r11d
+ andl $3, %r11d C r11d = n mod 4 selects the wind-up code
+
+ cmpl $1, %r11d
+ je L(do) C jump if n = 1 5 9 ...
+
+L(n1): cmpl $2, %r11d
+ jne L(n2) C jump unless n = 2 6 10 ...
+ addq %rbx, %rbx C rotate carry limb, restore acy
+ movq 8(up), %r10
+ adcq 8(vp), %r10
+ leaq 8(up), up
+ leaq 8(vp), vp
+ leaq 8(rp), rp
+ rcrq %r10 C CF enters at the top, low bit leaves in CF
+ rcrq %rbx
+ movq %rbx, -8(rp)
+ jmp L(cj1)
+
+L(n2): cmpl $3, %r11d
+ jne L(n3) C jump unless n = 3 7 11 ...
+ addq %rbx, %rbx C rotate carry limb, restore acy
+ movq 8(up), %r9
+ movq 16(up), %r10
+ adcq 8(vp), %r9
+ adcq 16(vp), %r10
+ leaq 16(up), up
+ leaq 16(vp), vp
+ leaq 16(rp), rp
+ rcrq %r10 C rotate from the highest limb down, chaining through CF
+ rcrq %r9
+ rcrq %rbx
+ movq %rbx, -16(rp)
+ jmp L(cj2)
+
+L(n3): decq n C come here for n = 4 8 12 ...
+ addq %rbx, %rbx C rotate carry limb, restore acy
+ movq 8(up), %r8
+ movq 16(up), %r9
+ adcq 8(vp), %r8
+ adcq 16(vp), %r9
+ movq 24(up), %r10
+ adcq 24(vp), %r10
+ leaq 24(up), up
+ leaq 24(vp), vp
+ leaq 24(rp), rp
+ rcrq %r10 C rotate from the highest limb down, chaining through CF
+ rcrq %r9
+ rcrq %r8
+ rcrq %rbx
+ movq %rbx, -24(rp)
+ movq %r8, -16(rp)
+L(cj2): movq %r9, -8(rp)
+L(cj1): movq %r10, %rbx C highest limb so far becomes the new carry limb
+
+L(do):
+ shrq $2, n C 4
+ je L(end) C 2
+ ALIGN(16)
+L(oop): addq %rbx, %rbx C rotate carry limb, restore acy
+
+ movq 8(up), %r8
+ movq 16(up), %r9
+ adcq 8(vp), %r8
+ adcq 16(vp), %r9
+ movq 24(up), %r10
+ movq 32(up), %r11
+ adcq 24(vp), %r10
+ adcq 32(vp), %r11
+
+ leaq 32(up), up
+ leaq 32(vp), vp
+
+ rcrq %r11 C rotate, save acy
+ rcrq %r10
+ rcrq %r9
+ rcrq %r8
+
+ rcrq %rbx
+ movq %rbx, (rp)
+ movq %r8, 8(rp)
+ movq %r9, 16(rp)
+ movq %r10, 24(rp)
+ movq %r11, %rbx C highest limb of this group carries into the next pass
+
+ leaq 32(rp), rp
+ decq n
+ jne L(oop)
+
+L(end): movq %rbx, (rp) C store the pending carry limb as the last result limb
+ popq %rbx C restore the register saved on entry
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86_64/rsh1aors_n.asm b/gmp/mpn/x86_64/rsh1aors_n.asm
deleted file mode 100644
index a3e9cc5d23..0000000000
--- a/gmp/mpn/x86_64/rsh1aors_n.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
-dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
-
-dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 2.14 (mpn_add_n + mpn_rshift need 4.125)
-C AMD K10 2.14 (mpn_add_n + mpn_rshift need 4.125)
-C Intel P4 12.75
-C Intel core2 3.75
-C Intel NMH 4.4
-C Intel SBR ?
-C Intel atom ?
-C VIA nano 3.25
-
-C TODO
-C * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
-
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n',` %rcx')
-
-ifdef(`OPERATION_rsh1add_n', `
- define(ADDSUB, add)
- define(ADCSBB, adc)
- define(func_n, mpn_rsh1add_n)
- define(func_nc, mpn_rsh1add_nc)')
-ifdef(`OPERATION_rsh1sub_n', `
- define(ADDSUB, sub)
- define(ADCSBB, sbb)
- define(func_n, mpn_rsh1sub_n)
- define(func_nc, mpn_rsh1sub_nc)')
-
-MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(func_nc)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8 ')
- push %rbx
-
- xor R32(%rax), R32(%rax)
- neg %r8 C set C flag from parameter
- mov (up), %rbx
- ADCSBB (vp), %rbx
- jmp L(ent)
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(func_n)
- FUNC_ENTRY(4)
- push %rbx
-
- xor R32(%rax), R32(%rax)
- mov (up), %rbx
- ADDSUB (vp), %rbx
-L(ent):
- rcr %rbx C rotate, save acy
- adc R32(%rax), R32(%rax) C return value
-
- mov R32(n), R32(%r11)
- and $3, R32(%r11)
-
- cmp $1, R32(%r11)
- je L(do) C jump if n = 1 5 9 ...
-
-L(n1): cmp $2, R32(%r11)
- jne L(n2) C jump unless n = 2 6 10 ...
- add %rbx, %rbx C rotate carry limb, restore acy
- mov 8(up), %r10
- ADCSBB 8(vp), %r10
- lea 8(up), up
- lea 8(vp), vp
- lea 8(rp), rp
- rcr %r10
- rcr %rbx
- mov %rbx, -8(rp)
- jmp L(cj1)
-
-L(n2): cmp $3, R32(%r11)
- jne L(n3) C jump unless n = 3 7 11 ...
- add %rbx, %rbx C rotate carry limb, restore acy
- mov 8(up), %r9
- mov 16(up), %r10
- ADCSBB 8(vp), %r9
- ADCSBB 16(vp), %r10
- lea 16(up), up
- lea 16(vp), vp
- lea 16(rp), rp
- rcr %r10
- rcr %r9
- rcr %rbx
- mov %rbx, -16(rp)
- jmp L(cj2)
-
-L(n3): dec n C come here for n = 4 8 12 ...
- add %rbx, %rbx C rotate carry limb, restore acy
- mov 8(up), %r8
- mov 16(up), %r9
- ADCSBB 8(vp), %r8
- ADCSBB 16(vp), %r9
- mov 24(up), %r10
- ADCSBB 24(vp), %r10
- lea 24(up), up
- lea 24(vp), vp
- lea 24(rp), rp
- rcr %r10
- rcr %r9
- rcr %r8
- rcr %rbx
- mov %rbx, -24(rp)
- mov %r8, -16(rp)
-L(cj2): mov %r9, -8(rp)
-L(cj1): mov %r10, %rbx
-
-L(do):
- shr $2, n C 4
- je L(end) C 2
- ALIGN(16)
-L(top): add %rbx, %rbx C rotate carry limb, restore acy
-
- mov 8(up), %r8
- mov 16(up), %r9
- ADCSBB 8(vp), %r8
- ADCSBB 16(vp), %r9
- mov 24(up), %r10
- mov 32(up), %r11
- ADCSBB 24(vp), %r10
- ADCSBB 32(vp), %r11
-
- lea 32(up), up
- lea 32(vp), vp
-
- rcr %r11 C rotate, save acy
- rcr %r10
- rcr %r9
- rcr %r8
-
- rcr %rbx
- mov %rbx, (rp)
- mov %r8, 8(rp)
- mov %r9, 16(rp)
- mov %r10, 24(rp)
- mov %r11, %rbx
-
- lea 32(rp), rp
- dec n
- jne L(top)
-
-L(end): mov %rbx, (rp)
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/rsh1sub_n.asm b/gmp/mpn/x86_64/rsh1sub_n.asm
new file mode 100644
index 0000000000..b08bba4735
--- /dev/null
+++ b/gmp/mpn/x86_64/rsh1sub_n.asm
@@ -0,0 +1,146 @@
+dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
+
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C K8,K9: 2.14 (mpn_sub_n + mpn_rshift need 4.125)
+C K10: 2.14 (mpn_sub_n + mpn_rshift need 4.125)
+C P4: 12.75
+C P6-15: 3.75
+
+C TODO
+C * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
+C * Try to approach the cache bandwidth 1.5 c/l. It should be possible.
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+define(`n32',`%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+ .byte 0,0,0,0,0,0,0,0
+PROLOGUE(mpn_rsh1sub_n)
+ pushq %rbx C 1
+
+ xorl %eax, %eax
+ movq (up), %rbx
+ subq (vp), %rbx
+
+ rcrq %rbx C rotate, save acy
+ adcl %eax, %eax C return value
+
+ movl n32, %r11d
+ andl $3, %r11d
+
+ cmpl $1, %r11d
+ je L(do) C jump if n = 1 5 9 ...
+
+L(n1): cmpl $2, %r11d
+ jne L(n2) C jump unless n = 2 6 10 ...
+ addq %rbx, %rbx C rotate carry limb, restore acy
+ movq 8(up), %r10
+ sbbq 8(vp), %r10
+ leaq 8(up), up
+ leaq 8(vp), vp
+ leaq 8(rp), rp
+ rcrq %r10
+ rcrq %rbx
+ movq %rbx, -8(rp)
+ jmp L(cj1)
+
+L(n2): cmpl $3, %r11d
+ jne L(n3) C jump unless n = 3 7 11 ...
+ addq %rbx, %rbx C rotate carry limb, restore acy
+ movq 8(up), %r9
+ movq 16(up), %r10
+ sbbq 8(vp), %r9
+ sbbq 16(vp), %r10
+ leaq 16(up), up
+ leaq 16(vp), vp
+ leaq 16(rp), rp
+ rcrq %r10
+ rcrq %r9
+ rcrq %rbx
+ movq %rbx, -16(rp)
+ jmp L(cj2)
+
+L(n3): decq n C come here for n = 4 8 12 ...
+ addq %rbx, %rbx C rotate carry limb, restore acy
+ movq 8(up), %r8
+ movq 16(up), %r9
+ sbbq 8(vp), %r8
+ sbbq 16(vp), %r9
+ movq 24(up), %r10
+ sbbq 24(vp), %r10
+ leaq 24(up), up
+ leaq 24(vp), vp
+ leaq 24(rp), rp
+ rcrq %r10
+ rcrq %r9
+ rcrq %r8
+ rcrq %rbx
+ movq %rbx, -24(rp)
+ movq %r8, -16(rp)
+L(cj2): movq %r9, -8(rp)
+L(cj1): movq %r10, %rbx
+
+L(do):
+ shrq $2, n C 4
+ je L(end) C 2
+ ALIGN(16)
+L(oop): addq %rbx, %rbx C rotate carry limb, restore acy
+
+ movq 8(up), %r8
+ movq 16(up), %r9
+ sbbq 8(vp), %r8
+ sbbq 16(vp), %r9
+ movq 24(up), %r10
+ movq 32(up), %r11
+ sbbq 24(vp), %r10
+ sbbq 32(vp), %r11
+
+ leaq 32(up), up
+ leaq 32(vp), vp
+
+ rcrq %r11 C rotate, save acy
+ rcrq %r10
+ rcrq %r9
+ rcrq %r8
+
+ rcrq %rbx
+ movq %rbx, (rp)
+ movq %r8, 8(rp)
+ movq %r9, 16(rp)
+ movq %r10, 24(rp)
+ movq %r11, %rbx
+
+ leaq 32(rp), rp
+ decq n
+ jne L(oop)
+
+L(end): movq %rbx, (rp)
+ popq %rbx
+ ret
+EPILOGUE()
diff --git a/gmp/mpn/x86_64/rshift.asm b/gmp/mpn/x86_64/rshift.asm
index 3f344f1dfc..8979d29ea8 100644
--- a/gmp/mpn/x86_64/rshift.asm
+++ b/gmp/mpn/x86_64/rshift.asm
@@ -1,44 +1,31 @@
-dnl AMD64 mpn_rshift -- mpn right shift.
+dnl AMD64 mpn_rshift -- mpn right shift.
-dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
+dnl Copyright 2003, 2005 Free Software Foundation, Inc.
dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.375
-C AMD K10 2.375
-C Intel P4 8
-C Intel core2 2.11
-C Intel corei ?
-C Intel atom 5.75
-C VIA nano 3.5
+C K8,K9: 2.375
+C K10: 2.375
+C P4: 8
+C P6-15 (Core2): 2.11
+C P6-28 (Atom): 5.75
C INPUT PARAMETERS
@@ -47,18 +34,14 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_rshift)
- FUNC_ENTRY(4)
- neg R32(%rcx) C put rsh count in cl
+ neg %ecx C put rsh count in cl
mov (up), %rax
- shl R8(%rcx), %rax C function return value
- neg R32(%rcx) C put lsh count in cl
+ shl %cl, %rax C function return value
+ neg %ecx C put lsh count in cl
lea 1(n), R32(%r8)
@@ -73,10 +56,10 @@ PROLOGUE(mpn_rshift)
jne L(1)
C n = 4, 8, 12, ...
mov 8(up,n,8), %r10
- shr R8(%rcx), %r10
- neg R32(%rcx) C put rsh count in cl
+ shr %cl, %r10
+ neg %ecx C put rsh count in cl
mov 16(up,n,8), %r8
- shl R8(%rcx), %r8
+ shl %cl, %r8
or %r8, %r10
mov %r10, 8(rp,n,8)
inc n
@@ -86,91 +69,90 @@ L(1): dec R32(%r8)
je L(1x) C jump for n = 1, 5, 9, 13, ...
C n = 2, 6, 10, 16, ...
mov 8(up,n,8), %r10
- shr R8(%rcx), %r10
- neg R32(%rcx) C put rsh count in cl
+ shr %cl, %r10
+ neg %ecx C put rsh count in cl
mov 16(up,n,8), %r8
- shl R8(%rcx), %r8
+ shl %cl, %r8
or %r8, %r10
mov %r10, 8(rp,n,8)
inc n
- neg R32(%rcx) C put lsh count in cl
+ neg %ecx C put lsh count in cl
L(1x):
cmp $-1, n
je L(ast)
mov 8(up,n,8), %r10
- shr R8(%rcx), %r10
+ shr %cl, %r10
mov 16(up,n,8), %r11
- shr R8(%rcx), %r11
- neg R32(%rcx) C put rsh count in cl
+ shr %cl, %r11
+ neg %ecx C put rsh count in cl
mov 16(up,n,8), %r8
mov 24(up,n,8), %r9
- shl R8(%rcx), %r8
+ shl %cl, %r8
or %r8, %r10
- shl R8(%rcx), %r9
+ shl %cl, %r9
or %r9, %r11
mov %r10, 8(rp,n,8)
mov %r11, 16(rp,n,8)
add $2, n
-L(rll): neg R32(%rcx) C put lsh count in cl
+L(rll): neg %ecx C put lsh count in cl
L(rlx): mov 8(up,n,8), %r10
- shr R8(%rcx), %r10
+ shr %cl, %r10
mov 16(up,n,8), %r11
- shr R8(%rcx), %r11
+ shr %cl, %r11
add $4, n C 4
jb L(end) C 2
ALIGN(16)
L(top):
C finish stuff from lsh block
- neg R32(%rcx) C put rsh count in cl
+ neg %ecx C put rsh count in cl
mov -16(up,n,8), %r8
mov -8(up,n,8), %r9
- shl R8(%rcx), %r8
+ shl %cl, %r8
or %r8, %r10
- shl R8(%rcx), %r9
+ shl %cl, %r9
or %r9, %r11
mov %r10, -24(rp,n,8)
mov %r11, -16(rp,n,8)
C start two new rsh
mov (up,n,8), %r8
mov 8(up,n,8), %r9
- shl R8(%rcx), %r8
- shl R8(%rcx), %r9
+ shl %cl, %r8
+ shl %cl, %r9
C finish stuff from rsh block
- neg R32(%rcx) C put lsh count in cl
+ neg %ecx C put lsh count in cl
mov -8(up,n,8), %r10
mov 0(up,n,8), %r11
- shr R8(%rcx), %r10
+ shr %cl, %r10
or %r10, %r8
- shr R8(%rcx), %r11
+ shr %cl, %r11
or %r11, %r9
mov %r8, -8(rp,n,8)
mov %r9, 0(rp,n,8)
C start two new lsh
mov 8(up,n,8), %r10
mov 16(up,n,8), %r11
- shr R8(%rcx), %r10
- shr R8(%rcx), %r11
+ shr %cl, %r10
+ shr %cl, %r11
add $4, n
jae L(top) C 2
L(end):
- neg R32(%rcx) C put rsh count in cl
- mov -8(up), %r8
- shl R8(%rcx), %r8
+ neg %ecx C put rsh count in cl
+ mov -16(up,n,8), %r8
+ shl %cl, %r8
or %r8, %r10
- mov (up), %r9
- shl R8(%rcx), %r9
+ mov -8(up,n,8), %r9
+ shl %cl, %r9
or %r9, %r11
- mov %r10, -16(rp)
- mov %r11, -8(rp)
+ mov %r10, -24(rp,n,8)
+ mov %r11, -16(rp,n,8)
- neg R32(%rcx) C put lsh count in cl
+ neg %ecx C put lsh count in cl
L(ast): mov (up), %r10
- shr R8(%rcx), %r10
+ shr %cl, %r10
mov %r10, (rp)
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/sec_tabselect.asm b/gmp/mpn/x86_64/sec_tabselect.asm
deleted file mode 100644
index e8aed261ef..0000000000
--- a/gmp/mpn/x86_64/sec_tabselect.asm
+++ /dev/null
@@ -1,176 +0,0 @@
-dnl AMD64 mpn_sec_tabselect.
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb good for cpu
-C AMD K8,K9 1.5 Y
-C AMD K10 1.4
-C AMD bd1 2.64
-C AMD bobcat 2.15 Y
-C Intel P4 4
-C Intel core2 1.38
-C Intel NHM 1.75
-C Intel SBR 1.25
-C Intel atom 2.5 Y
-C VIA nano 1.75 Y
-
-C NOTES
-C * This has not been tuned for any specific processor. Its speed should not
-C be too bad, though.
-C * Using SSE2/AVX2 could result in many-fold speedup.
-C * WORKS FOR n mod 4 = 0 ONLY!
-
-C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
-define(`rp', `%rdi')
-define(`tp', `%rsi')
-define(`n', `%rdx')
-define(`nents', `%rcx')
-define(`which', `%r8')
-
-define(`i', `%rbp')
-define(`j', `%r9')
-
-C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
-C nents n rp tab i which j * * * * * *
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(16)
-PROLOGUE(mpn_sec_tabselect)
- FUNC_ENTRY(4)
-IFDOS(` mov 56(%rsp), %r8d ')
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov n, j
- add $-4, j
- js L(outer_end)
-
-L(outer_top):
- mov nents, i
- push tp
- xor R32(%r12), R32(%r12)
- xor R32(%r13), R32(%r13)
- xor R32(%r14), R32(%r14)
- xor R32(%r15), R32(%r15)
- mov which, %rbx
-
- ALIGN(16)
-L(top): sub $1, %rbx
- sbb %rax, %rax
- mov 0(tp), %r10
- mov 8(tp), %r11
- and %rax, %r10
- and %rax, %r11
- or %r10, %r12
- or %r11, %r13
- mov 16(tp), %r10
- mov 24(tp), %r11
- and %rax, %r10
- and %rax, %r11
- or %r10, %r14
- or %r11, %r15
- lea (tp,n,8), tp
- add $-1, i
- jne L(top)
-
- mov %r12, 0(rp)
- mov %r13, 8(rp)
- mov %r14, 16(rp)
- mov %r15, 24(rp)
- pop tp
- lea 32(tp), tp
- lea 32(rp), rp
- add $-4, j
- jns L(outer_top)
-L(outer_end):
-
- test $2, R8(n)
- jz L(b0x)
-L(b1x): mov nents, i
- push tp
- xor R32(%r12), R32(%r12)
- xor R32(%r13), R32(%r13)
- mov which, %rbx
- ALIGN(16)
-L(tp2): sub $1, %rbx
- sbb %rax, %rax
- mov 0(tp), %r10
- mov 8(tp), %r11
- and %rax, %r10
- and %rax, %r11
- or %r10, %r12
- or %r11, %r13
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp2)
- mov %r12, 0(rp)
- mov %r13, 8(rp)
- pop tp
- lea 16(tp), tp
- lea 16(rp), rp
-
-L(b0x): test $1, R8(n)
- jz L(b00)
-L(b01): mov nents, i
- xor R32(%r12), R32(%r12)
- mov which, %rbx
- ALIGN(16)
-L(tp1): sub $1, %rbx
- sbb %rax, %rax
- mov 0(tp), %r10
- and %rax, %r10
- or %r10, %r12
- lea (tp,n,8), tp
- add $-1, i
- jne L(tp1)
- mov %r12, 0(rp)
-
-L(b00): pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/k8/sqr_basecase.asm b/gmp/mpn/x86_64/sqr_basecase.asm
index 60cf945a46..3ed4be1269 100644
--- a/gmp/mpn/x86_64/k8/sqr_basecase.asm
+++ b/gmp/mpn/x86_64/sqr_basecase.asm
@@ -2,33 +2,22 @@ dnl AMD64 mpn_sqr_basecase.
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2008, 2009 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
@@ -36,21 +25,14 @@ C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C NOTES
-C * There is a major stupidity in that we call mpn_mul_1 initially, for a
-C large trip count. Instead, we should follow the generic/sqr_basecase.c
-C code which uses addmul_2s from the start, conditionally leaving a 1x1
-C multiply to the end. (In assembly code, one would stop invoking
-C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
-C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to
-C save/restore carry, instead it can propagate into the high product word.
-C * Align more labels, should shave off a few cycles.
-C * We can safely use 32-bit size operations, since operands with (2^32)
-C limbs will lead to non-termination in practice.
+C * This code only handles operands up to SQR_KARATSUBA_THRESHOLD_MAX. That
+C means we can safely use 32-bit operations for all sizes, unlike in e.g.,
+C mpn_addmul_1.
C * The jump table could probably be optimized, at least for non-pic.
-C * The special code for n <= 4 was quickly written. It is probably too
+C * The special code for n=1,2,3 was quickly written. It is probably too
C large and unnecessarily slow.
-C * Consider combining small cases code so that the n=k-1 code jumps into the
-C middle of the n=k code.
+C * Consider combining small cases code so that the n=k-1 code jumps into
+C the middle of the n=k code.
C * Avoid saving registers for small cases code.
C * Needed variables:
C n r11 input size
@@ -75,6 +57,12 @@ define(`rp', `%rdi')
define(`up', `%rsi')
define(`n_param', `%rdx')
+C We should really trim this, for better spatial locality. Alternatively,
+C we could grab the upper part of the stack area, leaving the lower part
+C instead of the upper part unused.
+define(`SQR_KARATSUBA_THRESHOLD_MAX', 120)
+define(`STACK_ALLOC', eval(8*2*SQR_KARATSUBA_THRESHOLD_MAX))
+
define(`n', `%r11')
define(`tp', `%r12')
define(`i', `%r8')
@@ -86,137 +74,125 @@ define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
+define(`SPECIAL_CODE_FOR_4',1)
+
ASM_START()
TEXT
ALIGN(16)
-PROLOGUE(mpn_sqr_basecase)
- FUNC_ENTRY(3)
- mov R32(n_param), R32(%rcx)
- mov R32(n_param), R32(n) C free original n register (rdx)
- add $-40, %rsp
+PROLOGUE(mpn_sqr_basecase)
+ add $-48, %rsp
+ mov %rbx, 40(%rsp)
+ mov %rbp, 32(%rsp)
+ mov %r12, 24(%rsp)
+ mov %r13, 16(%rsp)
+ mov %r14, 8(%rsp)
+ mov R32(n_param), R32(n) C free original n register (rdx)
+ mov R32(n_param), R32(%rcx)
and $3, R32(%rcx)
+ lea 4(%rcx), %rbx
cmp $4, R32(n_param)
- lea 4(%rcx), %r8
-
- mov %rbx, 32(%rsp)
- mov %rbp, 24(%rsp)
- mov %r12, 16(%rsp)
- mov %r13, 8(%rsp)
- mov %r14, (%rsp)
-
- cmovg %r8, %rcx
-
- lea L(tab)(%rip), %rax
-ifdef(`PIC',
-` movslq (%rax,%rcx,4), %r10
- add %r10, %rax
- jmp *%rax
-',`
+ cmovg %rbx, %rcx
+ lea L(jmptab)(%rip), %rax
jmp *(%rax,%rcx,8)
-')
JUMPTABSECT
ALIGN(8)
-L(tab): JMPENT( L(4), L(tab))
- JMPENT( L(1), L(tab))
- JMPENT( L(2), L(tab))
- JMPENT( L(3), L(tab))
- JMPENT( L(0m4), L(tab))
- JMPENT( L(1m4), L(tab))
- JMPENT( L(2m4), L(tab))
- JMPENT( L(3m4), L(tab))
+L(jmptab):
+ .quad L(4)
+ .quad L(1)
+ .quad L(2)
+ .quad L(3)
+ .quad L(0m4)
+ .quad L(1m4)
+ .quad L(2m4)
+ .quad L(3m4)
TEXT
L(1): mov (up), %rax
mul %rax
- add $40, %rsp
mov %rax, (rp)
mov %rdx, 8(rp)
- FUNC_EXIT()
+ add $40, %rsp
+ pop %rbx
ret
L(2): mov (up), %rax
- mov %rax, %r8
mul %rax
- mov 8(up), %r11
mov %rax, (rp)
- mov %r11, %rax
mov %rdx, %r9
+ mov 8(up), %rax
mul %rax
- add $40, %rsp
mov %rax, %r10
- mov %r11, %rax
mov %rdx, %r11
- mul %r8
- xor %r8, %r8
+ mov 8(up), %rax
+ mov (up), %rbx
+ mul %rbx
add %rax, %r9
adc %rdx, %r10
- adc %r8, %r11
+ adc $0, %r11
add %rax, %r9
mov %r9, 8(rp)
adc %rdx, %r10
mov %r10, 16(rp)
- adc %r8, %r11
+ adc $0, %r11
mov %r11, 24(rp)
- FUNC_EXIT()
+ add $40, %rsp
+ pop %rbx
ret
L(3): mov (up), %rax
- mov %rax, %r10
mul %rax
- mov 8(up), %r11
mov %rax, (rp)
- mov %r11, %rax
mov %rdx, 8(rp)
+ mov 8(up), %rax
mul %rax
- mov 16(up), %rcx
mov %rax, 16(rp)
- mov %rcx, %rax
mov %rdx, 24(rp)
+ mov 16(up), %rax
mul %rax
mov %rax, 32(rp)
mov %rdx, 40(rp)
- mov %r11, %rax
- mul %r10
+ mov (up), %rbx
+ mov 8(up), %rax
+ mul %rbx
mov %rax, %r8
- mov %rcx, %rax
mov %rdx, %r9
- mul %r10
- xor %r10, %r10
+ mov 16(up), %rax
+ mul %rbx
+ xor R32(%r10), R32(%r10)
add %rax, %r9
- mov %r11, %rax
- mov %r10, %r11
adc %rdx, %r10
- mul %rcx
- add $40, %rsp
+ mov 8(up), %rbx
+ mov 16(up), %rax
+ mul %rbx
+ xor R32(%r11), R32(%r11)
add %rax, %r10
- adc %r11, %rdx
+ adc %rdx, %r11
add %r8, %r8
adc %r9, %r9
adc %r10, %r10
- adc %rdx, %rdx
adc %r11, %r11
+ mov $0, R32(%rbx)
+ adc %rbx, %rbx
add %r8, 8(rp)
adc %r9, 16(rp)
adc %r10, 24(rp)
- adc %rdx, 32(rp)
- adc %r11, 40(rp)
- FUNC_EXIT()
+ adc %r11, 32(rp)
+ adc %rbx, 40(rp)
+ add $40, %rsp
+ pop %rbx
ret
+ifdef(`SPECIAL_CODE_FOR_4',`
L(4): mov (up), %rax
- mov %rax, %r11
mul %rax
- mov 8(up), %rbx
mov %rax, (rp)
- mov %rbx, %rax
mov %rdx, 8(rp)
+ mov 8(up), %rax
mul %rax
mov %rax, 16(rp)
mov %rdx, 24(rp)
@@ -227,71 +203,77 @@ L(4): mov (up), %rax
mov 24(up), %rax
mul %rax
mov %rax, 48(rp)
- mov %rbx, %rax
mov %rdx, 56(rp)
- mul %r11
- add $32, %rsp
+ mov (up), %rbx
+ mov 8(up), %rax
+ mul %rbx
mov %rax, %r8
mov %rdx, %r9
mov 16(up), %rax
- mul %r11
- xor %r10, %r10
+ mul %rbx
+ xor R32(%r10), R32(%r10)
add %rax, %r9
adc %rdx, %r10
mov 24(up), %rax
- mul %r11
- xor %r11, %r11
+ mul %rbx
+ xor R32(%r11), R32(%r11)
add %rax, %r10
adc %rdx, %r11
+ mov 8(up), %rbx
mov 16(up), %rax
mul %rbx
- xor %rcx, %rcx
+ xor R32(%r12), R32(%r12)
add %rax, %r10
adc %rdx, %r11
- adc $0, %rcx
+ adc $0, %r12
mov 24(up), %rax
mul %rbx
- pop %rbx
add %rax, %r11
- adc %rdx, %rcx
- mov 16(up), %rdx
+ adc %rdx, %r12
+ mov 16(up), %rbx
mov 24(up), %rax
- mul %rdx
- add %rax, %rcx
- adc $0, %rdx
+ mul %rbx
+ xor R32(%rbp), R32(%rbp)
+ add %rax, %r12
+ adc %rdx, %rbp
add %r8, %r8
adc %r9, %r9
adc %r10, %r10
adc %r11, %r11
- adc %rcx, %rcx
- mov $0, R32(%rax)
- adc %rdx, %rdx
+ adc %r12, %r12
+ mov $0, R32(%rbx)
+ adc %rbp, %rbp
- adc %rax, %rax
+ adc %rbx, %rbx
add %r8, 8(rp)
adc %r9, 16(rp)
adc %r10, 24(rp)
adc %r11, 32(rp)
- adc %rcx, 40(rp)
- adc %rdx, 48(rp)
- adc %rax, 56(rp)
- FUNC_EXIT()
+ adc %r12, 40(rp)
+ adc %rbp, 48(rp)
+ adc %rbx, 56(rp)
+ add $24, %rsp
+ pop %r12
+ pop %rbp
+ pop %rbx
ret
+')
-
-L(0m4):
- lea -16(rp,n,8), tp C point tp in middle of result operand
- mov (up), v0
- mov 8(up), %rax
+L(0m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp in middle of result operand
lea (up,n,8), up C point up at end of input operand
- lea -4(n), i
+ lea -1(n), i
C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
- xor R32(j), R32(j)
- sub n, j
+ mov $-1, j
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+ mov (up,j,8), v0
+ mov 8(up,j,8), %rax
mul v0
xor R32(w2), R32(w2)
mov %rax, w0
@@ -333,28 +315,31 @@ L(L3): xor R32(w1), R32(w1)
adc %rdx, w1
mov w2, 8(tp)
mov w1, 16(tp)
-
- lea eval(2*8)(tp), tp C tp += 2
- lea -8(up), up
+ lea eval(24+2*8)(tp), tp C tp += 2, undo offset FIXME
+ifdef(`SPECIAL_CODE_FOR_4',`',`
+ cmp $3, R32(i)
+ je L(last)
+')
jmp L(dowhile)
+L(1m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp in middle of result operand
+ lea (up,n,8), up C point up at end of input operand
-L(1m4):
- lea 8(rp,n,8), tp C point tp in middle of result operand
- mov (up), v0 C u0
- mov 8(up), %rax C u1
- lea 8(up,n,8), up C point up at end of input operand
-
- lea -3(n), i
+ lea (n), i
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
- lea -3(n), j
- neg j
+ mov $3, R32(j)
+ sub i, j
+
+ lea 8(up), up C offset FIXME
- mov %rax, v1 C u1
+ mov -32(up,j,8), v0 C u0
+ mov -24(up,j,8), v1 C u1
+ mov -24(up,j,8), %rax C u1
mul v0 C u0 * u1
mov %rdx, w1
xor R32(w2), R32(w2)
- mov %rax, 8(rp)
+ mov %rax, -24(tp,j,8)
jmp L(m0)
ALIGN(16)
@@ -396,7 +381,7 @@ L(m0): mov -16(up,j,8), %rax C u2, u6 ...
add %rax, w3
mov w2, -8(tp,j,8)
adc %rdx, w0
-L(m2x): mov (up,j,8), %rax
+ mov (up,j,8), %rax
mul v0
add %rax, w3
adc %rdx, w0
@@ -412,22 +397,28 @@ L(m2x): mov (up,j,8), %rax
mov w0, -8(tp)
mov w1, (tp)
- lea -16(up), up
- lea eval(3*8-24)(tp), tp C tp += 3
- jmp L(dowhile_end)
+ lea -8(up), up C undo offset FIXME
+ lea eval(3*8)(tp), tp C tp += 3
+ add $-2, R32(i) C i -= 2
+ cmp $3, R32(i)
+ je L(last)
+ jmp L(dowhile)
-L(2m4):
- lea -16(rp,n,8), tp C point tp in middle of result operand
- mov (up), v0
- mov 8(up), %rax
+
+L(2m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp in middle of result operand
lea (up,n,8), up C point up at end of input operand
- lea -4(n), i
+ lea -1(n), i
C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
- lea -2(n), j
- neg j
+ mov $1, R32(j)
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+ mov -16(up,j,8), v0
+ mov -8(up,j,8), %rax
mul v0
mov %rax, w2
mov (up,j,8), %rax
@@ -469,28 +460,30 @@ L(L1): xor R32(w0), R32(w0)
mov w2, 8(tp)
mov w1, 16(tp)
- lea eval(2*8)(tp), tp C tp += 2
- lea -8(up), up
+ lea eval(24+2*8)(tp), tp C tp += 2, undo offset FIXME
jmp L(dowhile_mid)
-L(3m4):
- lea 8(rp,n,8), tp C point tp in middle of result operand
- mov (up), v0 C u0
- mov 8(up), %rax C u1
- lea 8(up,n,8), up C point up at end of input operand
- lea -5(n), i
+L(3m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp in middle of result operand
+ lea (up,n,8), up C point up at end of input operand
+
+ lea (n), i
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
- lea -1(n), j
- neg j
+ mov $1, R32(j)
+ sub i, j
- mov %rax, v1 C u1
- mul v0 C u0 * u1
+ lea 8(up), up C offset FIXME
+
+ mov -16(up,j,8), v0
+ mov -8(up,j,8), v1
+ mov -8(up,j,8), %rax
+ mul v0 C v0 * u0
mov %rdx, w3
xor R32(w0), R32(w0)
xor R32(w1), R32(w1)
- mov %rax, 8(rp)
+ mov %rax, -8(tp,j,8)
jmp L(m2)
ALIGN(16)
@@ -548,13 +541,18 @@ L(m2): mov (up,j,8), %rax
mov w0, -8(tp)
mov w1, (tp)
- lea -16(up), up
+ lea -8(up), up C undo offset FIXME
+ lea eval(3*8)(tp), tp C tp += 3
+ add $-2, R32(i) C i -= 2
jmp L(dowhile_mid)
L(dowhile):
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
- lea 4(i), j
- neg j
+ mov $-1, j
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+ lea -8(up), up C offset FIXME
mov 16(up,j,8), v0
mov 24(up,j,8), v1
@@ -623,13 +621,18 @@ L(am2): mov 32(up,j,8), %rax
mov w1, 16(tp)
lea eval(2*8)(tp), tp C tp += 2
-
add $-2, R32(i) C i -= 2
+ lea 24(tp), tp C undo offset FIXME
+ lea 8(up), up C undo offset FIXME
+
L(dowhile_mid):
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
- lea 2(i), j
- neg j
+ mov $1, R32(j)
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+ lea -8(up), up C offset FIXME
mov (up,j,8), v0
mov 8(up,j,8), v1
@@ -696,57 +699,74 @@ L(20): mov 16(up,j,8), %rax
mov w0, 8(tp)
mov w1, 16(tp)
- lea eval(2*8)(tp), tp C tp += 2
-L(dowhile_end):
+ lea 24(tp), tp C undo offset FIXME
+ lea 8(up), up C undo offset FIXME
+ lea eval(2*8)(tp), tp C tp += 2
add $-2, R32(i) C i -= 2
+
+ cmp $3, R32(i)
jne L(dowhile)
+L(last):
+
C Function mpn_addmul_2s_2
- mov -16(up), v0
- mov -8(up), v1
- mov -8(up), %rax
+ mov -24(up), v0
+ mov -16(up), v1
+ mov -16(up), %rax
mul v0
xor R32(w3), R32(w3)
- add %rax, -8(tp)
+ add %rax, -32(tp)
adc %rdx, w3
xor R32(w0), R32(w0)
xor R32(w1), R32(w1)
- mov (up), %rax
+ mov -8(up), %rax
mul v0
add %rax, w3
- mov (up), %rax
+ mov -8(up), %rax
adc %rdx, w0
mul v1
- add w3, (tp)
+ add w3, -24(tp)
adc %rax, w0
adc %rdx, w1
- mov w0, 8(tp)
- mov w1, 16(tp)
+ mov w0, -16(tp)
+ mov w1, -8(tp)
C Function mpn_sqr_diag_addlsh1
- lea -4(n,n), j
+ mov R32(n), R32(j)
+ shl $3, n
+ sub n, up
+
+ mov (%rsp), %r11
- mov 8(rp), %r11
- lea -8(up), up
- lea (rp,j,8), rp
+ bt $0, j
+ lea -4(j,j),j
+ jc L(odd)
+
+L(evn): lea (rp,j,8), rp
+ lea (up,j,4), up
+ lea 8(%rsp,j,8), tp
neg j
- mov (up,j,4), %rax
- mul %rax
- test $2, R8(j)
- jnz L(odd)
-L(evn): add %r11, %r11
+ add %r11, %r11
sbb R32(%rbx), R32(%rbx) C save CF
+ mov (up,j,4), %rax
+ mul %rax
add %rdx, %r11
mov %rax, (rp,j,8)
jmp L(d0)
-L(odd): add %r11, %r11
+L(odd): lea -16(rp,j,8), rp
+ lea -8(up,j,4), up
+ lea -8(%rsp,j,8), tp
+ neg j
+
+ add %r11, %r11
sbb R32(%rbp), R32(%rbp) C save CF
+ mov 8(up,j,4), %rax
+ mul %rax
add %rdx, %r11
- mov %rax, (rp,j,8)
- lea -2(j), j
+ mov %rax, 16(rp,j,8)
jmp L(d1)
ALIGN(16)
@@ -757,9 +777,9 @@ L(top): mov (up,j,4), %rax
adc %rdx, %r11
mov %r10, (rp,j,8)
L(d0): mov %r11, 8(rp,j,8)
- mov 16(rp,j,8), %r10
+ mov (tp,j,8), %r10
adc %r10, %r10
- mov 24(rp,j,8), %r11
+ mov 8(tp,j,8), %r11
adc %r11, %r11
nop
sbb R32(%rbp), R32(%rbp) C save CF
@@ -770,38 +790,38 @@ L(d0): mov %r11, 8(rp,j,8)
adc %rdx, %r11
mov %r10, 16(rp,j,8)
L(d1): mov %r11, 24(rp,j,8)
- mov 32(rp,j,8), %r10
+ mov 16(tp,j,8), %r10
adc %r10, %r10
- mov 40(rp,j,8), %r11
+ mov 24(tp,j,8), %r11
adc %r11, %r11
sbb R32(%rbx), R32(%rbx) C save CF
add $4, j
js L(top)
- mov (up), %rax
+L(end): mov (up,j,4), %rax
mul %rax
add R32(%rbp), R32(%rbp) C restore carry
adc %rax, %r10
adc %rdx, %r11
- mov %r10, (rp)
- mov %r11, 8(rp)
- mov 16(rp), %r10
+ mov %r10, (rp,j,8)
+ mov %r11, 8(rp,j,8)
+ mov (tp,j,8), %r10
adc %r10, %r10
sbb R32(%rbp), R32(%rbp) C save CF
neg R32(%rbp)
- mov 8(up), %rax
+ mov 8(up,j,4), %rax
mul %rax
add R32(%rbx), R32(%rbx) C restore carry
adc %rax, %r10
adc %rbp, %rdx
- mov %r10, 16(rp)
- mov %rdx, 24(rp)
+ mov %r10, 16(rp,j,8)
+ mov %rdx, 24(rp,j,8)
+ add $eval(8+STACK_ALLOC), %rsp
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/sqr_diag_addlsh1.asm b/gmp/mpn/x86_64/sqr_diag_addlsh1.asm
deleted file mode 100644
index 4ad034c855..0000000000
--- a/gmp/mpn/x86_64/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,116 +0,0 @@
-dnl AMD64 mpn_sqr_diag_addlsh1
-
-dnl Contributed to the GNU project by Torbjörn Granlund.
-
-dnl Copyright 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C AMD K8,K9 2.5
-C AMD K10 2.5
-C AMD bull 3.6
-C AMD pile 3.6
-C AMD steam ?
-C AMD bobcat 4
-C AMD jaguar ?
-C Intel P4 ?
-C Intel core 4
-C Intel NHM 3.6
-C Intel SBR 3.15
-C Intel IBR 3.2
-C Intel HWL 2.6
-C Intel BWL ?
-C Intel atom 14
-C VIA nano 3.5
-
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-define(`rp', `%rdi')
-define(`tp', `%rsi')
-define(`up_arg', `%rdx')
-define(`n', `%rcx')
-
-define(`up', `%r11')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(mpn_sqr_diag_addlsh1)
- FUNC_ENTRY(4)
- push %rbx
-
- dec n
- shl n
-
- mov (up_arg), %rax
-
- lea (rp,n,8), rp
- lea (tp,n,8), tp
- lea (up_arg,n,4), up
- neg n
-
- mul %rax
- mov %rax, (rp,n,8)
-
- xor R32(%rbx), R32(%rbx)
- jmp L(mid)
-
- ALIGN(16)
-L(top): add %r10, %r8
- adc %rax, %r9
- mov %r8, -8(rp,n,8)
- mov %r9, (rp,n,8)
-L(mid): mov 8(up,n,4), %rax
- mov (tp,n,8), %r8
- mov 8(tp,n,8), %r9
- adc %r8, %r8
- adc %r9, %r9
- lea (%rdx,%rbx), %r10
- setc R8(%rbx)
- mul %rax
- add $2, n
- js L(top)
-
-L(end): add %r10, %r8
- adc %rax, %r9
- mov %r8, I(-8(rp),-8(rp,n,8))
- mov %r9, I((rp),(rp,n,8))
- adc %rbx, %rdx
- mov %rdx, I(8(rp),8(rp,n,8))
-
- pop %rbx
- FUNC_EXIT()
- ret
-EPILOGUE()
diff --git a/gmp/mpn/x86_64/sublsh1_n.asm b/gmp/mpn/x86_64/sublsh1_n.asm
index c6d829fcb2..a943ed1579 100644
--- a/gmp/mpn/x86_64/sublsh1_n.asm
+++ b/gmp/mpn/x86_64/sublsh1_n.asm
@@ -1,44 +1,31 @@
dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
-dnl Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
-dnl
+
dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
-dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
-dnl
-dnl or both in parallel, as here.
-dnl
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
-dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.2
-C AMD K10 2.2
-C Intel P4 12.75
-C Intel core2 3.45
-C Intel corei ?
-C Intel atom ?
-C VIA nano 3.25
+C K8,K9: 2.2
+C K10: 2.2
+C P4: 12.75
+C P6-15: 3.45
+
C Sometimes speed degenerates, supposedly related to that some operand
C alignments cause cache conflicts.
@@ -52,14 +39,10 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
- FUNC_ENTRY(4)
push %rbx
push %rbp
@@ -119,7 +102,7 @@ L(b01): add %r8, %r8
L(ent): jns L(end)
ALIGN(16)
-L(top): add R32(%rax), R32(%rax) C restore scy
+L(oop): add R32(%rax), R32(%rax) C restore scy
mov (vp,n,8), %r8
L(b00): adc %r8, %r8
@@ -148,13 +131,12 @@ L(b00): adc %r8, %r8
sbb R32(%rbp), R32(%rbp) C save acy
add $4, n
- js L(top)
+ js L(oop)
L(end): add R32(%rbp), R32(%rax)
neg R32(%rax)
pop %rbp
pop %rbx
- FUNC_EXIT()
ret
EPILOGUE()
diff --git a/gmp/mpn/x86_64/x86_64-defs.m4 b/gmp/mpn/x86_64/x86_64-defs.m4
index 366598b41d..fc296c2a1e 100644
--- a/gmp/mpn/x86_64/x86_64-defs.m4
+++ b/gmp/mpn/x86_64/x86_64-defs.m4
@@ -2,78 +2,30 @@ divert(-1)
dnl m4 macros for amd64 assembler.
-dnl Copyright 1999-2005, 2008, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of either:
-dnl
-dnl * the GNU Lesser General Public License as published by the Free
-dnl Software Foundation; either version 3 of the License, or (at your
-dnl option) any later version.
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005 Free Software
+dnl Foundation, Inc.
dnl
-dnl or
-dnl
-dnl * the GNU General Public License as published by the Free Software
-dnl Foundation; either version 2 of the License, or (at your option) any
-dnl later version.
+dnl This file is part of the GNU MP Library.
dnl
-dnl or both in parallel, as here.
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-dnl for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
dnl
-dnl You should have received copies of the GNU General Public License and the
-dnl GNU Lesser General Public License along with the GNU MP Library. If not,
-dnl see https://www.gnu.org/licenses/.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-dnl Usage: CPUVEC_FUNCS_LIST
+dnl Notes:
dnl
-dnl A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the
-dnl order they appear in that structure.
-
-define(CPUVEC_FUNCS_LIST,
-``add_n',
-`addlsh1_n',
-`addlsh2_n',
-`addmul_1',
-`addmul_2',
-`bdiv_dbm1c',
-`cnd_add_n',
-`cnd_sub_n',
-`com',
-`copyd',
-`copyi',
-`divexact_1',
-`divrem_1',
-`gcd_1',
-`lshift',
-`lshiftc',
-`mod_1',
-`mod_1_1p',
-`mod_1_1p_cps',
-`mod_1s_2p',
-`mod_1s_2p_cps',
-`mod_1s_4p',
-`mod_1s_4p_cps',
-`mod_34lsub1',
-`modexact_1c_odd',
-`mul_1',
-`mul_basecase',
-`mullo_basecase',
-`preinv_divrem_1',
-`preinv_mod_1',
-`redc_1',
-`redc_2',
-`rshift',
-`sqr_basecase',
-`sub_n',
-`sublsh1_n',
-`submul_1'')
+dnl The 32-bit mode x86/x86-defs.m4 has various 32bit-isms, like the
+dnl profiling calls, so it seems cleanest to start a fresh set of defines
+dnl for 64-bit mode.
dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
@@ -135,11 +87,8 @@ define(`ASSERT_counter',incr(ASSERT_counter))')')')
define(ASSERT_counter,1)
-define(`LEA',`dnl
-ifdef(`PIC',
- `mov $1@GOTPCREL(%rip), $2'
-,
- `movabs `$'$1, $2')
+define(`LEA',`
+ mov $1@GOTPCREL(%rip), $2
')
@@ -192,163 +141,11 @@ define(`R8',
dnl Usage: CALL(funcname)
dnl
-define(`CALL',`dnl
ifdef(`PIC',
- `call GSYM_PREFIX`'$1@PLT'
-,
- `call GSYM_PREFIX`'$1'
-)')
+ `define(`CALL',`call GSYM_PREFIX`'$1@PLT')',
+ `define(`CALL',`call GSYM_PREFIX`'$1')')
define(`JUMPTABSECT', `.section .data.rel.ro.local,"aw",@progbits')
-
-dnl Usage: JMPENT(targlabel,tablabel)
-
-define(`JMPENT',`dnl
-ifdef(`PIC',
- `.long $1-$2'
-,
- `.quad $1'
-)')
-
-
-dnl These macros are defined just for DOS64, where they provide calling
-dnl sequence glue code.
-
-define(`FUNC_ENTRY',`')
-define(`FUNC_EXIT',`')
-
-
-dnl Target ABI macros.
-
-define(`IFDOS', `')
-define(`IFSTD', `$1')
-define(`IFELF', `$1')
-
-
-dnl Usage: PROTECT(symbol)
-dnl
-dnl Used for private GMP symbols that should never be overridden by users.
-dnl This can save reloc entries and improve shlib sharing as well as
-dnl application startup times
-
-define(`PROTECT', `.hidden $1')
-
-
-dnl Usage: x86_lookup(target, key,value, key,value, ...)
-dnl
-dnl Look for `target' among the `key' parameters.
-dnl
-dnl x86_lookup expands to the corresponding `value', or generates an error
-dnl if `target' isn't found.
-
-define(x86_lookup,
-m4_assert_numargs_range(1,999)
-`ifelse(eval($#<3),1,
-`m4_error(`unrecognised part of x86 instruction: $1
-')',
-`ifelse(`$1',`$2', `$3',
-`x86_lookup(`$1',shift(shift(shift($@))))')')')
-
-
-dnl Usage: x86_opcode_regxmm(reg)
-dnl
-dnl Validate the given xmm register, and return its number, 0 to 7.
-
-define(x86_opcode_regxmm,
-m4_assert_numargs(1)
-`x86_lookup(`$1',x86_opcode_regxmm_list)')
-
-define(x86_opcode_regxmm_list,
-``%xmm0',0,
-`%xmm1',1,
-`%xmm2',2,
-`%xmm3',3,
-`%xmm4',4,
-`%xmm5',5,
-`%xmm6',6,
-`%xmm7',7,
-`%xmm8',8,
-`%xmm9',9,
-`%xmm10',10,
-`%xmm11',11,
-`%xmm12',12,
-`%xmm13',13,
-`%xmm14',14,
-`%xmm15',15')
-
-dnl Usage: palignr($imm,%srcreg,%dstreg)
-dnl
-dnl Emit a palignr instruction, using a .byte sequence, since obsolete but
-dnl still distributed versions of gas don't know SSSE3 instructions.
-
-define(`palignr',
-m4_assert_numargs(3)
-`.byte 0x66,dnl
-ifelse(eval(x86_opcode_regxmm($3) >= 8 || x86_opcode_regxmm($2) >= 8),1,
- `eval(0x40+x86_opcode_regxmm($3)/8*4+x86_opcode_regxmm($2)/8),')dnl
-0x0f,0x3a,0x0f,dnl
-eval(0xc0+x86_opcode_regxmm($3)%8*8+x86_opcode_regxmm($2)%8),dnl
-substr($1,1)')
-
-
-dnl Usage
-dnl
-dnl regnum(op) raw operand index (so slightly misnamed)
-dnl regnumh(op) high bit of register operand nimber
-dnl ix(op) 0 for reg operand, 1 for plain pointer operand.
-dnl
-
-define(`regnum',`x86_lookup(`$1',oplist)')
-define(`regnumh',`eval(regnum($1)/8 & 1)')
-define(`ix',`eval(regnum($1)/16)')
-define(`oplist',
-``%rax', 0, `%rcx', 1, `%rdx', 2, `%rbx', 3,
- `%rsp', 4, `%rbp', 5, `%rsi', 6, `%rdi', 7,
- `%r8', 8, `%r9', 9, `%r10', 10, `%r11', 11,
- `%r12', 12, `%r13', 13, `%r14', 14, `%r15', 15,
- `(%rax)',16, `(%rcx)',17, `(%rdx)',18, `(%rbx)',19,
- `(%rsp)',20, `(%rbp)',21, `(%rsi)',22, `(%rdi)',23,
- `(%r8)', 24, `(%r9)', 25, `(%r10)',26, `(%r11)',27,
- `(%r12)',28, `(%r13)',29, `(%r14)',30, `(%r15)' 31')
-
-
-dnl Usage
-dnl
-dnl mulx(reg1,reg2,reg3)
-dnl
-dnl or
-dnl
-dnl mulx((reg1),reg2,reg3)
-dnl
-dnl where reg1 is any register but rsp,rbp,r12,r13, or
-dnl
-dnl mulx(off,(reg1),reg2,reg3)
-dnl
-dnl where reg1 is any register but rsp,r12.
-dnl
-dnl The exceptions are due to special coding needed for some registers; rsp
-dnl and r12 need an extra byte 0x24 at the end while rbp and r13 lack the
-dnl offset-less form.
-dnl
-dnl Other addressing forms are not handled. Invalid forms are not properly
-dnl detected. Offsets that don't fit one byte are not handled correctly.
-
-define(`mulx',`dnl
-ifelse($#,3,
-`.byte 0xc4`'dnl
-,0x`'eval(0xe2^32*regnumh($1)^128*regnumh($3),16)`'dnl
-,0x`'eval(0xfb-8*regnum($2),16)`'dnl
-,0xf6`'dnl
-,0x`'eval(0xc0+(7 & regnum($1))+8*(7 & regnum($3))-0xc0*ix($1),16)`'dnl
-',$#,4,
-`.byte 0xc4`'dnl
-,0x`'eval(0xe2^32*regnumh($2)^128*regnumh($4),16)`'dnl
-,0x`'eval(0xfb-8*regnum($3),16)`'dnl
-,0xf6`'dnl
-,0x`'eval(0x40+(7 & regnum($2))+8*(7 & regnum($4)),16)`'dnl
-,0x`'eval(($1 + 256) % 256,16)`'dnl
-')')
-
divert`'dnl
diff --git a/gmp/mpn/z8000/README b/gmp/mpn/z8000/README
new file mode 100644
index 0000000000..e1cf22df42
--- /dev/null
+++ b/gmp/mpn/z8000/README
@@ -0,0 +1,45 @@
+Copyright 2003, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+
+
+
+ Z8000 MPN SUBROUTINES
+
+
+This directory contains mpn functions for the Zilog Z8000.
+
+
+STATUS
+
+This code is old and has not been used for a long time.
+
+mpn/z8000 uses a 16-bit limb.  It's possible this doesn't really work, since
+various bits of C code assume limb>=long and of course long is invariably at
+least 32 bits.
+
+mpn/z8000x uses a 32-bit limb; this could perhaps be an ABI choice.
+Currently it's reached only by an MPN_PATH override.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/gmp/mpn/z8000/add_n.s b/gmp/mpn/z8000/add_n.s
new file mode 100644
index 0000000000..89fbb1a280
--- /dev/null
+++ b/gmp/mpn/z8000/add_n.s
@@ -0,0 +1,51 @@
+! Z8000 __gmpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+! Copyright 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ unseg
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+ pop r0,@r6 ! r0 = lowest s1 limb; pop also steps s1_ptr to the next limb
+ pop r1,@r5 ! r1 = lowest s2 limb; pop also steps s2_ptr to the next limb
+ add r0,r1 ! lowest limb: plain add, there is no carry-in
+ ld @r7,r0 ! store the lowest result limb
+ dec r4 ! one limb done
+ jr eq,Lend ! size was 1: nothing left for the loop
+Loop: pop r0,@r6 ! next s1 limb
+ pop r1,@r5 ! next s2 limb
+ adc r0,r1 ! add including the carry out of the previous limb
+ inc r7,#2 ! step res_ptr by one 16-bit limb (first store did not step it)
+ ld @r7,r0 ! store this result limb
+ dec r4 ! count down; the loop relies on carry surviving inc/ld/dec
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2 ! r2 = 0 + 0 + carry flag = carry out of the top limb
+ ret t ! unconditional return; presumably r2 is the result - confirm ABI
diff --git a/gmp/mpn/z8000/gmp-mparam.h b/gmp/mpn/z8000/gmp-mparam.h
new file mode 100644
index 0000000000..f42e380a70
--- /dev/null
+++ b/gmp/mpn/z8000/gmp-mparam.h
@@ -0,0 +1,21 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define BITS_PER_MP_LIMB 16 /* limb width in bits */
+#define BYTES_PER_MP_LIMB 2 /* limb width in bytes (16 bits / 8) */
diff --git a/gmp/mpn/z8000/mul_1.s b/gmp/mpn/z8000/mul_1.s
new file mode 100644
index 0000000000..fa92bc32bf
--- /dev/null
+++ b/gmp/mpn/z8000/mul_1.s
@@ -0,0 +1,66 @@
+! Z8000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! size r5
+! s2_limb r4
+
+ unseg
+ .text
+ even
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+ sub r2,r2 ! zero carry limb
+ and r4,r4 ! set flags from s2_limb
+ jr mi,Lneg ! s2_limb msb set: use the loop with the extra correction
+
+Lpos: pop r1,@r6 ! r1 = next s1 limb; pop also steps s1_ptr
+ ld r9,r1 ! multiplicand goes in r9, the low half of rr8
+ mult rr8,r4 ! signed multiply: rr8 = r9 * r4 (r8 = hi, r9 = lo)
+ and r1,r1 ! set the sign flag from the msb of the loaded limb
+ jr pl,Lp ! msb clear: limb was not read as negative, hi is right
+ add r8,r4 ! hi_limb += sign_comp2 (limb was read as limb - 2^16)
+Lp: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2 ! clear r2; relies on xor keeping the carry from the add
+ adc r2,r8 ! cy_limb = hi_limb + carry
+ ld @r7,r9 ! store the low product limb
+ inc r7,#2 ! step res_ptr by one 16-bit limb
+ dec r5 ! one limb done
+ jr ne,Lpos
+ ret t ! return with the final carry limb left in r2
+
+Lneg: pop r1,@r6 ! r1 = next s1 limb; pop also steps s1_ptr
+ ld r9,r1 ! multiplicand goes in r9, the low half of rr8
+ mult rr8,r4 ! signed multiply: rr8 = r9 * r4 (r8 = hi, r9 = lo)
+ add r8,r1 ! hi_limb += sign_comp1
+ and r1,r1 ! set the sign flag from the msb of the loaded limb
+ jr pl,Ln ! msb clear: no second correction needed
+ add r8,r4 ! hi_limb += sign_comp2
+Ln: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2 ! clear r2; relies on xor keeping the carry from the add
+ adc r2,r8 ! cy_limb = hi_limb + carry
+ ld @r7,r9 ! store the low product limb
+ inc r7,#2 ! step res_ptr by one 16-bit limb
+ dec r5 ! one limb done
+ jr ne,Lneg
+ ret t ! return with the final carry limb left in r2
diff --git a/gmp/mpn/z8000/sub_n.s b/gmp/mpn/z8000/sub_n.s
new file mode 100644
index 0000000000..1dbd83760e
--- /dev/null
+++ b/gmp/mpn/z8000/sub_n.s
@@ -0,0 +1,52 @@
+! Z8000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ unseg
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ pop r0,@r6 ! r0 = lowest s1 limb; pop also steps s1_ptr to the next limb
+ pop r1,@r5 ! r1 = lowest s2 limb; pop also steps s2_ptr to the next limb
+ sub r0,r1 ! lowest limb: plain subtract, there is no borrow-in
+ ld @r7,r0 ! store the lowest result limb
+ dec r4 ! one limb done
+ jr eq,Lend ! size was 1: nothing left for the loop
+Loop: pop r0,@r6 ! next s1 limb
+ pop r1,@r5 ! next s2 limb
+ sbc r0,r1 ! subtract including the borrow out of the previous limb
+ inc r7,#2 ! step res_ptr by one 16-bit limb (first store did not step it)
+ ld @r7,r0 ! store this result limb
+ dec r4 ! count down; the loop relies on carry surviving inc/ld/dec
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2 ! r2 = 0 + 0 + carry flag; presumably the borrow - confirm flag sense
+ ret t ! unconditional return; presumably r2 is the result - confirm ABI
diff --git a/gmp/mpn/z8000x/add_n.s b/gmp/mpn/z8000x/add_n.s
new file mode 100644
index 0000000000..26b47e278b
--- /dev/null
+++ b/gmp/mpn/z8000x/add_n.s
@@ -0,0 +1,54 @@
+! Z8000 (32 bit limb version) __gmpn_add_n -- Add two limb vectors of equal,
+! non-zero length.
+
+! Copyright 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ segm
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+ popl rr0,@r6 ! rr0 = lowest 32-bit s1 limb; popl also steps s1_ptr
+ popl rr8,@r5 ! rr8 = lowest 32-bit s2 limb; popl also steps s2_ptr
+ addl rr0,rr8 ! lowest limb: plain 32-bit add, there is no carry-in
+ ldl @r7,rr0 ! store the lowest result limb
+ dec r4 ! one limb done
+ jr eq,Lend ! size was 1: nothing left for the loop
+Loop: popl rr0,@r6 ! next s1 limb
+ popl rr8,@r5 ! next s2 limb
+ adc r1,r9 ! low halves, including the carry out of the previous limb
+ adc r0,r8 ! high halves plus the carry out of the low halves
+ inc r7,#4 ! step res_ptr by one 32-bit limb (first store did not step it)
+ ldl @r7,rr0 ! store this result limb
+ dec r4 ! count down; the loop relies on carry surviving inc/ldl/dec
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4 ! zero r3 as well, so rr2 = 0
+ adc r2,r2 ! r2 = carry. NOTE(review): r2 is the high half of rr2 - confirm
+ ret t ! unconditional return; result register (r2 vs rr2) - confirm ABI
diff --git a/gmp/mpn/z8000x/sub_n.s b/gmp/mpn/z8000x/sub_n.s
new file mode 100644
index 0000000000..837ecef0cf
--- /dev/null
+++ b/gmp/mpn/z8000x/sub_n.s
@@ -0,0 +1,54 @@
+! Z8000 (32 bit limb version) __gmpn_sub_n -- Subtract two limb vectors of the
+! same length > 0 and store difference in a third limb vector.
+
+! Copyright 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 3 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ segm
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ popl rr0,@r6 ! rr0 = lowest 32-bit s1 limb; popl also steps s1_ptr
+ popl rr8,@r5 ! rr8 = lowest 32-bit s2 limb; popl also steps s2_ptr
+ subl rr0,rr8 ! lowest limb: plain 32-bit subtract, there is no borrow-in
+ ldl @r7,rr0 ! store the lowest result limb
+ dec r4 ! one limb done
+ jr eq,Lend ! size was 1: nothing left for the loop
+Loop: popl rr0,@r6 ! next s1 limb
+ popl rr8,@r5 ! next s2 limb
+ sbc r1,r9 ! low halves, including the borrow out of the previous limb
+ sbc r0,r8 ! high halves minus the borrow out of the low halves
+ inc r7,#4 ! step res_ptr by one 32-bit limb (first store did not step it)
+ ldl @r7,rr0 ! store this result limb
+ dec r4 ! count down; the loop relies on carry surviving inc/ldl/dec
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4 ! zero r3 as well, so rr2 = 0
+ adc r2,r2 ! r2 = carry. NOTE(review): r2 is the high half of rr2 - confirm
+ ret t ! unconditional return; result register (r2 vs rr2) - confirm ABI