diff options
Diffstat (limited to 'mpn/powerpc32')
-rw-r--r-- | mpn/powerpc32/aors_n.asm | 19 | ||||
-rw-r--r-- | mpn/powerpc32/p3-p7/aors_n.asm | 176 | ||||
-rw-r--r-- | mpn/powerpc32/p5/gmp-mparam.h | 137 | ||||
-rw-r--r-- | mpn/powerpc32/p6/gmp-mparam.h | 206 | ||||
-rw-r--r-- | mpn/powerpc32/p7/gmp-mparam.h | 149 | ||||
-rw-r--r-- | mpn/powerpc32/tabselect.asm | 98 |
6 files changed, 613 insertions, 172 deletions
diff --git a/mpn/powerpc32/aors_n.asm b/mpn/powerpc32/aors_n.asm index f9e9b50d5..12115a9e9 100644 --- a/mpn/powerpc32/aors_n.asm +++ b/mpn/powerpc32/aors_n.asm @@ -19,14 +19,17 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C 603e: ? -C 604e: ? old: 3.25 -C 75x (G3): ? old: 3.5 -C 7400,7410 (G4): 3.25 -C 744x,745x (G4+): 4 -C power4/ppc970: ? old: 2.0 -C power5: ? old: 2.5 +C cycles/limb +C 603e: ? +C 604e: ? old: 3.25 +C 75x (G3): ? old: 3.5 +C 7400,7410 (G4): 3.25 +C 744x,745x (G4+): 4 +C POWER3/PPC630 2 +C POWER4/PPC970 2.4 +C POWER5 2.75 +C POWER6 40-140 +C POWER7 3 C INPUT PARAMETERS define(`rp', `r3') diff --git a/mpn/powerpc32/p3-p7/aors_n.asm b/mpn/powerpc32/p3-p7/aors_n.asm new file mode 100644 index 000000000..6999182a8 --- /dev/null +++ b/mpn/powerpc32/p3-p7/aors_n.asm @@ -0,0 +1,176 @@ +dnl PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2 +C POWER6 2.78 +C POWER7 2.15-2.87 + +C This code is based on powerpc64/aors_n.asm. + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): stw r31, -4(r1) + stw r30, -8(r1) + stw r29, -12(r1) + stw r28, -16(r1) + + rlwinm. r0, r6, 0,30,31 C r0 = n & 3, set cr0 + cmpwi cr6, r0, 2 + addi r6, r6, 3 C compute count... + srwi r6, r6, 2 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): lwz r8, 0(r4) C load s1 limb + lwz r9, 0(r5) C load s2 limb + lwz r10, 4(r4) C load s1 limb + lwz r11, 4(r5) C load s2 limb + lwz r12, 8(r4) C load s1 limb + addi r4, r4, 12 + lwz r0, 8(r5) C load s2 limb + addi r5, r5, 12 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r29, 0(r3) + stw r30, 4(r3) + stw r31, 8(r3) + addi r3, r3, 12 + bdnz L(go) + b L(ret) + +L(b01): lwz r12, 0(r4) C load s1 limb + addi r4, r4, 4 + lwz r0, 0(r5) C load s2 limb + addi r5, r5, 4 + ADDSUBC r31, r0, r12 C add + stw r31, 0(r3) + addi r3, r3, 4 + bdnz L(go) + b L(ret) + +L(b10): lwz r10, 0(r4) C load s1 limb + lwz r11, 0(r5) C load s2 limb + lwz r12, 4(r4) C load s1 limb + addi r4, r4, 8 + lwz r0, 4(r5) C load s2 limb + addi r5, r5, 8 + ADDSUBC r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + stw r30, 0(r3) + stw r31, 4(r3) + addi r3, r3, 8 + bdnz L(go) + b L(ret) + +L(b00): C INITCY C clear/set cy +L(go): lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb + lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + bdz L(end) + + addi r4, r4, 16 + addi r5, r5, 16 + + ALIGN(16) +L(top): ADDSUBC r28, r7, r6 + lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb + ADDSUBC r29, r9, r8 + lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + ADDSUBC r30, r11, r10 + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + ADDSUBC r31, r0, r12 + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + stw r28, 0(r3) + addi r4, r4, 16 + stw r29, 4(r3) + addi r5, r5, 16 + stw r30, 8(r3) + stw r31, 12(r3) + addi r3, r3, 16 + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r7, r6 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r28, 0(r3) + stw r29, 4(r3) + stw r30, 8(r3) + stw r31, 12(r3) + +L(ret): lwz r31, -4(r1) + lwz r30, -8(r1) + lwz r29, -12(r1) + lwz r28, -16(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/mpn/powerpc32/p5/gmp-mparam.h b/mpn/powerpc32/p5/gmp-mparam.h index a8400ce65..ba210ecc4 100644 --- a/mpn/powerpc32/p5/gmp-mparam.h +++ b/mpn/powerpc32/p5/gmp-mparam.h @@ -30,114 +30,117 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 8 #define MOD_1U_TO_MOD_1_1_THRESHOLD 6 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 46 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 #define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ -#define BMOD_1_TO_MOD_1_THRESHOLD 62 +#define BMOD_1_TO_MOD_1_THRESHOLD 61 #define MUL_TOOM22_THRESHOLD 22 -#define MUL_TOOM33_THRESHOLD 78 +#define MUL_TOOM33_THRESHOLD 57 #define MUL_TOOM44_THRESHOLD 130 -#define MUL_TOOM6H_THRESHOLD 206 -#define MUL_TOOM8H_THRESHOLD 260 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 309 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 #define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 -#define SQR_BASECASE_THRESHOLD 0 /* always */ -#define SQR_TOOM2_THRESHOLD 42 +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 40 #define SQR_TOOM3_THRESHOLD 77 -#define SQR_TOOM4_THRESHOLD 169 -#define SQR_TOOM6_THRESHOLD 246 -#define SQR_TOOM8_THRESHOLD 381 +#define SQR_TOOM4_THRESHOLD 124 +#define SQR_TOOM6_THRESHOLD 140 +#define SQR_TOOM8_THRESHOLD 238 + +#define MULMID_TOOM42_THRESHOLD 40 #define MULMOD_BNM1_THRESHOLD 15 -#define SQRMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define POWM_SEC_TABLE 4,29,252,840,2080 -#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ #define MUL_FFT_TABLE3 \ - { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ - { 13, 5}, { 27, 6}, { 21, 7}, { 11, 6}, \ - { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ - { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ - { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ - { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ - { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ - { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ - { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ - { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ - { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ - { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \ - { 575,10}, { 159,11}, { 95, 9}, { 383,12}, \ - { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ - { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ - { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ - { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ - { 767,10}, { 415, 9}, { 831,11}, { 223,12}, \ - { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 76 + { { 412, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415, 9}, { 831,11}, { 223,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 71 #define MUL_FFT_THRESHOLD 4736 -#define SQR_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ #define SQR_FFT_TABLE3 \ - { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ - { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ - { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ - { 19, 6}, { 77, 7}, { 39, 8}, { 23, 7}, \ - { 47, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \ - { 23, 8}, { 47,10}, { 15, 7}, { 121, 9}, \ - { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ - { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ - { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ - { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \ - { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 47,10}, { 31, 9}, \ + { 71,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ - { 159, 9}, { 319,10}, { 175,11}, { 95,10}, \ - { 191, 9}, { 383,10}, { 207,12}, { 63,11}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 88 +#define SQR_FFT_TABLE3_SIZE 76 #define SQR_FFT_THRESHOLD 3712 #define MULLO_BASECASE_THRESHOLD 2 #define MULLO_DC_THRESHOLD 68 #define MULLO_MUL_N_THRESHOLD 9236 -#define DC_DIV_QR_THRESHOLD 70 -#define DC_DIVAPPR_Q_THRESHOLD 238 +#define DC_DIV_QR_THRESHOLD 69 +#define DC_DIVAPPR_Q_THRESHOLD 220 #define DC_BDIV_QR_THRESHOLD 75 #define DC_BDIV_Q_THRESHOLD 188 #define INV_MULMOD_BNM1_THRESHOLD 54 -#define INV_NEWTON_THRESHOLD 250 -#define INV_APPR_THRESHOLD 246 +#define INV_NEWTON_THRESHOLD 230 +#define INV_APPR_THRESHOLD 230 -#define BINV_NEWTON_THRESHOLD 375 +#define BINV_NEWTON_THRESHOLD 278 #define REDC_1_TO_REDC_N_THRESHOLD 87 -#define MU_DIV_QR_THRESHOLD 1334 -#define MU_DIVAPPR_Q_THRESHOLD 1387 -#define MUPI_DIV_QR_THRESHOLD 114 -#define MU_BDIV_QR_THRESHOLD 1078 -#define MU_BDIV_Q_THRESHOLD 1334 +#define MU_DIV_QR_THRESHOLD 1210 +#define MU_DIVAPPR_Q_THRESHOLD 1308 +#define MUPI_DIV_QR_THRESHOLD 106 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1210 #define MATRIX22_STRASSEN_THRESHOLD 14 -#define HGCD_THRESHOLD 104 -#define GCD_DC_THRESHOLD 424 -#define GCDEXT_DC_THRESHOLD 321 +#define HGCD_THRESHOLD 110 +#define HGCD_APPR_THRESHOLD 138 +#define HGCD_REDUCE_THRESHOLD 2578 +#define GCD_DC_THRESHOLD 408 +#define GCDEXT_DC_THRESHOLD 298 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 12 -#define GET_STR_PRECOMPUTE_THRESHOLD 23 -#define SET_STR_DC_THRESHOLD 454 -#define SET_STR_PRECOMPUTE_THRESHOLD 1074 +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 527 +#define SET_STR_PRECOMPUTE_THRESHOLD 1090 diff --git a/mpn/powerpc32/p6/gmp-mparam.h b/mpn/powerpc32/p6/gmp-mparam.h index 73951d0ae..529a66d19 100644 --- a/mpn/powerpc32/p6/gmp-mparam.h +++ b/mpn/powerpc32/p6/gmp-mparam.h @@ -29,115 +29,127 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_NORM_THRESHOLD 3 #define MOD_1_UNNORM_THRESHOLD 0 /* always */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 3 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 15 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 #define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ -#define MUL_TOOM22_THRESHOLD 34 -#define MUL_TOOM33_THRESHOLD 70 -#define MUL_TOOM44_THRESHOLD 187 -#define MUL_TOOM6H_THRESHOLD 286 -#define MUL_TOOM8H_THRESHOLD 321 +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 55 +#define MUL_TOOM44_THRESHOLD 88 +#define MUL_TOOM6H_THRESHOLD 137 +#define MUL_TOOM8H_THRESHOLD 181 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 110 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 118 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 107 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 56 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56 #define SQR_BASECASE_THRESHOLD 0 /* always */ -#define SQR_TOOM2_THRESHOLD 68 -#define SQR_TOOM3_THRESHOLD 113 -#define SQR_TOOM4_THRESHOLD 312 -#define SQR_TOOM6_THRESHOLD 330 -#define SQR_TOOM8_THRESHOLD 357 +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 56 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 296 -#define MULMOD_BNM1_THRESHOLD 19 -#define SQRMOD_BNM1_THRESHOLD 20 +#define MULMID_TOOM42_THRESHOLD 26 -#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */ +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define POWM_SEC_TABLE 2,26,127,453,1068 + +#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */ #define MUL_FFT_TABLE3 \ - { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ - { 10, 5}, { 21, 6}, { 17, 7}, { 9, 6}, \ - { 20, 7}, { 11, 6}, { 24, 7}, { 13, 8}, \ - { 7, 7}, { 21, 8}, { 11, 7}, { 27, 9}, \ - { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ - { 41, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \ + { { 212, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \ + { 19, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \ { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ - { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ - { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ - { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ - { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ - { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ - { 191, 8}, { 383,11}, { 63,10}, { 127, 9}, \ - { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \ - { 287,10}, { 159, 9}, { 319,11}, { 95,10}, \ - { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ - { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ - { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \ - { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ - { 415,11}, { 223,10}, { 447,12}, { 4096,13}, \ - { 8192,14}, { 16384,15}, { 32768,16} } -#define MUL_FFT_TABLE3_SIZE 83 -#define MUL_FFT_THRESHOLD 4736 - -#define SQR_FFT_MODF_THRESHOLD 312 /* k = 5 */ -#define SQR_FFT_TABLE3 \ - { { 312, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ - { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ - { 27, 7}, { 17, 6}, { 35, 7}, { 21, 8}, \ - { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ - { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ - { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ - { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ - { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 7}, { 287, 9}, \ { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ - { 127, 8}, { 255,10}, { 79, 9}, { 159, 8}, \ - { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \ - { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 127, 8}, { 255, 7}, { 511, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319, 9}, \ + { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \ + { 383, 9}, { 207,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 89 +#define MUL_FFT_THRESHOLD 1728 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 23, 9}, { 7, 8}, { 23, 9}, \ + { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 63, 8}, { 127, 7}, \ + { 255, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \ + { 575, 9}, { 79,10}, { 47,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 143, 8}, \ + { 287, 7}, { 575,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175, 8}, { 351,10}, { 95, 9}, \ + { 191, 8}, { 383, 9}, { 207,10}, { 111, 9}, \ + { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \ { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ - { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \ - { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ - { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ - { 159,10}, { 319, 9}, { 639,10}, { 351,11}, \ - { 191,10}, { 383, 9}, { 767,10}, { 415,11}, \ - { 223,10}, { 447,12}, { 4096,13}, { 8192,14}, \ - { 16384,15}, { 32768,16} } -#define SQR_FFT_TABLE3_SIZE 78 -#define SQR_FFT_THRESHOLD 2752 - -#define MULLO_BASECASE_THRESHOLD 0 /* always */ -#define MULLO_DC_THRESHOLD 151 -#define MULLO_MUL_N_THRESHOLD 1175 - -#define DC_DIV_QR_THRESHOLD 133 -#define DC_DIVAPPR_Q_THRESHOLD 442 -#define DC_BDIV_QR_THRESHOLD 130 -#define DC_BDIV_Q_THRESHOLD 324 - -#define INV_MULMOD_BNM1_THRESHOLD 116 -#define INV_NEWTON_THRESHOLD 507 -#define INV_APPR_THRESHOLD 454 - -#define BINV_NEWTON_THRESHOLD 507 -#define REDC_1_TO_REDC_N_THRESHOLD 118 - -#define MU_DIV_QR_THRESHOLD 1652 -#define MU_DIVAPPR_Q_THRESHOLD 1752 -#define MUPI_DIV_QR_THRESHOLD 225 -#define MU_BDIV_QR_THRESHOLD 762 -#define MU_BDIV_Q_THRESHOLD 1017 - -#define MATRIX22_STRASSEN_THRESHOLD 28 -#define HGCD_THRESHOLD 76 -#define GCD_DC_THRESHOLD 333 -#define GCDEXT_DC_THRESHOLD 245 + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \ + { 415,11}, { 223,10}, { 447, 9}, { 895,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 92 +#define SQR_FFT_THRESHOLD 1600 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 57 +#define MULLO_MUL_N_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 187 +#define DC_BDIV_QR_THRESHOLD 64 +#define DC_BDIV_Q_THRESHOLD 146 + +#define INV_MULMOD_BNM1_THRESHOLD 68 +#define INV_NEWTON_THRESHOLD 182 +#define INV_APPR_THRESHOLD 182 + +#define BINV_NEWTON_THRESHOLD 186 +#define REDC_1_TO_REDC_N_THRESHOLD 60 + +#define MU_DIV_QR_THRESHOLD 924 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 73 +#define MU_BDIV_QR_THRESHOLD 667 +#define MU_BDIV_Q_THRESHOLD 823 + +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 61 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 974 +#define GCD_DC_THRESHOLD 195 +#define GCDEXT_DC_THRESHOLD 134 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 10 -#define GET_STR_PRECOMPUTE_THRESHOLD 20 -#define SET_STR_DC_THRESHOLD 199 -#define SET_STR_PRECOMPUTE_THRESHOLD 478 +#define GET_STR_DC_THRESHOLD 9 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 190 +#define SET_STR_PRECOMPUTE_THRESHOLD 411 diff --git a/mpn/powerpc32/p7/gmp-mparam.h b/mpn/powerpc32/p7/gmp-mparam.h new file mode 100644 index 000000000..bd18d4042 --- /dev/null +++ b/mpn/powerpc32/p7/gmp-mparam.h @@ -0,0 +1,149 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2008, 2009, +2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define BYTES_PER_MP_LIMB 4 + +/* 3550 MHz POWER7/T4 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 34 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 363 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 50 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 381 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 4,35,225,780,2212 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 14, 5}, { 29, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671, 8}, { 1343,10}, { 351,11}, \ + { 191,10}, { 415, 9}, { 831,10}, { 431,11}, \ + { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 77 +#define MUL_FFT_THRESHOLD 5312 + +#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543, 8}, { 1087,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 79 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 34 +#define MULLO_MUL_N_THRESHOLD 10323 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 202 +#define DC_BDIV_QR_THRESHOLD 68 +#define DC_BDIV_Q_THRESHOLD 152 + +#define INV_MULMOD_BNM1_THRESHOLD 66 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 189 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1442 +#define MUPI_DIV_QR_THRESHOLD 91 +#define MU_BDIV_QR_THRESHOLD 1308 +#define MU_BDIV_Q_THRESHOLD 1442 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 126 +#define HGCD_APPR_THRESHOLD 139 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 573 +#define GCDEXT_DC_THRESHOLD 448 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 9 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 834 +#define SET_STR_PRECOMPUTE_THRESHOLD 1888 diff --git a/mpn/powerpc32/tabselect.asm b/mpn/powerpc32/tabselect.asm new file mode 100644 index 000000000..155a7b495 --- /dev/null +++ b/mpn/powerpc32/tabselect.asm @@ -0,0 +1,98 @@ +dnl PowerPC-32 mpn_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? +C 75x (G3): ? +C 7400,7410 (G4): ? +C 744x,745x (G4+): ? +C power4/ppc970: 3.3 +C power5: ? + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. +C * Using VMX could result in significant speedup for certain CPUs. + +C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `r3') +define(`tp', `r4') +define(`n', `r5') +define(`nents', `r6') +define(`which', `r7') + +define(`mask', `r8') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_tabselect) + addi r0, n, 1 + srwi r0, r0, 1 C inner loop count + andi. r9, n, 1 C set cr0 for use in inner loop + subf which, nents, which + slwi n, n, 2 + +L(outer): + mtctr r0 C put inner loop count in ctr + + add r9, which, nents C are we at the selected table entry? + addic r9, r9, -1 C set CF iff not selected entry + subfe mask, r0, r0 + + beq cr0, L(top) C branch to loop entry if n even + + lwz r9, 0(tp) + addi tp, tp, 4 + and r9, r9, mask + lwz r11, 0(rp) + andc r11, r11, mask + or r9, r9, r11 + stw r9, 0(rp) + addi rp, rp, 4 + bdz L(end) + + ALIGN(16) +L(top): lwz r9, 0(tp) + lwz r10, 4(tp) + addi tp, tp, 8 + nop + and r9, r9, mask + and r10, r10, mask + lwz r11, 0(rp) + lwz r12, 4(rp) + andc r11, r11, mask + andc r12, r12, mask + or r9, r9, r11 + or r10, r10, r12 + stw r9, 0(rp) + stw r10, 4(rp) + addi rp, rp, 8 + bdnz L(top) + +L(end): subf rp, n, rp C move rp back to beginning + cmpwi cr6, nents, 1 + addi nents, nents, -1 + bne cr6, L(outer) + + blr +EPILOGUE() |