summary refs log tree commit diff
path: root/mpn/powerpc64
diff options
context:
space:
mode:
Diffstat (limited to 'mpn/powerpc64')
-rw-r--r-- mpn/powerpc64/com.asm 9
-rw-r--r-- mpn/powerpc64/copyd.asm 9
-rw-r--r-- mpn/powerpc64/copyi.asm 9
-rw-r--r-- mpn/powerpc64/logops_n.asm 9
-rw-r--r-- mpn/powerpc64/lshift.asm 11
-rw-r--r-- mpn/powerpc64/lshiftc.asm (renamed from mpn/powerpc64/mode64/lshiftc.asm) 16
-rw-r--r-- mpn/powerpc64/mode64/aors_n.asm 14
-rw-r--r-- mpn/powerpc64/mode64/aorscnd_n.asm 185
-rw-r--r-- mpn/powerpc64/mode64/aorslshC_n.asm 11
-rw-r--r-- mpn/powerpc64/mode64/aorsmul_1.asm 15
-rw-r--r-- mpn/powerpc64/mode64/bdiv_dbm1c.asm 4
-rw-r--r-- mpn/powerpc64/mode64/dive_1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/divrem_1.asm 13
-rw-r--r-- mpn/powerpc64/mode64/divrem_2.asm 11
-rw-r--r-- mpn/powerpc64/mode64/invert_limb.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mod_1_1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mod_1_4.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mod_34lsub1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mode1o.asm 10
-rw-r--r-- mpn/powerpc64/mode64/mul_1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mul_basecase.asm 12
-rw-r--r-- mpn/powerpc64/mode64/p3/gmp-mparam.h 73
-rw-r--r-- mpn/powerpc64/mode64/p4/gmp-mparam.h 31
-rw-r--r-- mpn/powerpc64/mode64/p5/gmp-mparam.h 41
-rw-r--r-- mpn/powerpc64/mode64/p6/aorsmul_1.asm 172
-rw-r--r-- mpn/powerpc64/mode64/p6/gmp-mparam.h 85
-rw-r--r-- mpn/powerpc64/mode64/p6/mul_basecase.asm 2
-rw-r--r-- mpn/powerpc64/mode64/p7/gmp-mparam.h 159
-rw-r--r-- mpn/powerpc64/mode64/rsh1add_n.asm 11
-rw-r--r-- mpn/powerpc64/mode64/rsh1sub_n.asm 11
-rw-r--r-- mpn/powerpc64/mode64/sqr_basecase.asm 852
-rw-r--r-- mpn/powerpc64/mode64/sqr_diag_addlsh1.asm 238
-rw-r--r-- mpn/powerpc64/rshift.asm 11
-rw-r--r-- mpn/powerpc64/tabselect.asm 96
34 files changed, 1738 insertions, 448 deletions
diff --git a/mpn/powerpc64/com.asm b/mpn/powerpc64/com.asm
index 4fb2e65d7..cb89bade2 100644
--- a/mpn/powerpc64/com.asm
+++ b/mpn/powerpc64/com.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1?
-C POWER4/PPC970: 1.6
+C cycles/limb
+C POWER3/PPC630 1?
+C POWER4/PPC970 1.6
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.45
C TODO
C * 8-way unrolling brings timing down to about 1.3 cycles/limb.
diff --git a/mpn/powerpc64/copyd.asm b/mpn/powerpc64/copyd.asm
index 6a46a433c..256e7dc12 100644
--- a/mpn/powerpc64/copyd.asm
+++ b/mpn/powerpc64/copyd.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1
-C POWER4/PPC970: 1
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
C INPUT PARAMETERS
C rp r3
diff --git a/mpn/powerpc64/copyi.asm b/mpn/powerpc64/copyi.asm
index 5cb7e4856..31d1fc2e7 100644
--- a/mpn/powerpc64/copyi.asm
+++ b/mpn/powerpc64/copyi.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1
-C POWER4/PPC970: 1
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
C INPUT PARAMETERS
C rp r3
diff --git a/mpn/powerpc64/logops_n.asm b/mpn/powerpc64/logops_n.asm
index 917b59f45..2caa2c7c6 100644
--- a/mpn/powerpc64/logops_n.asm
+++ b/mpn/powerpc64/logops_n.asm
@@ -20,9 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1.75
-C POWER4/PPC970: 2.10
+C cycles/limb
+C POWER3/PPC630 1.75
+C POWER4/PPC970 2.10
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.75
C n POWER3/PPC630 POWER4/PPC970
C 1 15.00 15.33
diff --git a/mpn/powerpc64/lshift.asm b/mpn/powerpc64/lshift.asm
index f97661ae7..eb70c4983 100644
--- a/mpn/powerpc64/lshift.asm
+++ b/mpn/powerpc64/lshift.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/lshiftc.asm
index 647244d1f..8f470a5f4 100644
--- a/mpn/powerpc64/mode64/lshiftc.asm
+++ b/mpn/powerpc64/lshiftc.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.5
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.5
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
@@ -189,6 +190,9 @@ L(cj2): std r10, -32(rp)
L(ret): ld r31, -8(r1)
ld r30, -16(r1)
- mr r3, retval
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, retval, 32
+ mr r4, retval
+',` mr r3, retval')
blr
EPILOGUE()
diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm
index 980525f67..8c30871c2 100644
--- a/mpn/powerpc64/mode64/aors_n.asm
+++ b/mpn/powerpc64/mode64/aors_n.asm
@@ -1,6 +1,6 @@
dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,11 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.5
-C POWER4/PPC970 2
-C POWER5 2.25
-C POWER6 2.63
+C cycles/limb
+C POWER3/PPC630 1.5
+C POWER4/PPC970 2
+C POWER5 2
+C POWER6 2.63
+C POWER7 2.25-2.87
C This code is a little bit slower for POWER3/PPC630 than the simple code used
C previously, but it is much faster for POWER4/PPC970. The reason for the
@@ -136,6 +137,7 @@ L(go): ld r6, 0(r4) C load s1 limb
addi r4, r4, 32
addi r5, r5, 32
+ ALIGN(16)
L(top): ADDSUBC r28, r7, r6
ld r6, 0(r4) C load s1 limb
ld r7, 0(r5) C load s2 limb
diff --git a/mpn/powerpc64/mode64/aorscnd_n.asm b/mpn/powerpc64/mode64/aorscnd_n.asm
new file mode 100644
index 000000000..47aa6fb39
--- /dev/null
+++ b/mpn/powerpc64/mode64/aorscnd_n.asm
@@ -0,0 +1,185 @@
+dnl PowerPC-64 mpn_addcnd_n/mpn_subcnd_n.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 2.25
+C POWER5 ?
+C POWER6 ?
+C POWER7 ?
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+define(`cnd', `r7')
+
+ifdef(`OPERATION_addcnd_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addcnd_n)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_subcnd_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_subcnd_n)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ subfic cnd, cnd, 0
+ subfe cnd, cnd, cnd
+
+ rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi r6, r6, 3 C compute count...
+ srdi r6, r6, 2 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r8, 0(up) C load s1 limb
+ ld r9, 0(vp) C load s2 limb
+ ld r10, 8(up) C load s1 limb
+ ld r11, 8(vp) C load s2 limb
+ ld r12, 16(up) C load s1 limb
+ addi up, up, 24
+ ld r0, 16(vp) C load s2 limb
+ addi vp, vp, 24
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r29, 0(rp)
+ std r30, 8(rp)
+ std r31, 16(rp)
+ addi rp, rp, 24
+ bdnz L(go)
+ b L(ret)
+
+L(b01): ld r12, 0(up) C load s1 limb
+ addi up, up, 8
+ ld r0, 0(vp) C load s2 limb
+ addi vp, vp, 8
+ and r0, r0, cnd
+ ADDSUB r31, r0, r12 C add
+ std r31, 0(rp)
+ addi rp, rp, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b10): ld r10, 0(up) C load s1 limb
+ ld r11, 0(vp) C load s2 limb
+ ld r12, 8(up) C load s1 limb
+ addi up, up, 16
+ ld r0, 8(vp) C load s2 limb
+ addi vp, vp, 16
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r30, r11, r10 C add
+ ADDSUBC r31, r0, r12 C add
+ std r30, 0(rp)
+ std r31, 8(rp)
+ addi rp, rp, 16
+ bdnz L(go)
+ b L(ret)
+
+L(b00): CLRCB C clear/set cy
+L(go): ld r6, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdz L(end)
+
+ addi up, up, 32
+ addi vp, vp, 32
+
+L(top): ADDSUBC r28, r27, r6
+ ld r6, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ADDSUBC r29, r9, r8
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ADDSUBC r30, r11, r10
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ADDSUBC r31, r0, r12
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ std r28, 0(rp)
+ addi up, up, 32
+ std r29, 8(rp)
+ addi vp, vp, 32
+ std r30, 16(rp)
+ std r31, 24(rp)
+ addi rp, rp, 32
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r27, r6
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r28, 0(rp)
+ std r29, 8(rp)
+ std r30, 16(rp)
+ std r31, 24(rp)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/aorslshC_n.asm b/mpn/powerpc64/mode64/aorslshC_n.asm
index 4622cd946..3776d3e59 100644
--- a/mpn/powerpc64/mode64/aorslshC_n.asm
+++ b/mpn/powerpc64/mode64/aorslshC_n.asm
@@ -17,11 +17,12 @@ dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-C cycles/limb
-C POWER3/PPC630 1.83 (1.5 c/l should be possible)
-C POWER4/PPC970 3 (2.0 c/l should be possible)
-C POWER5 3
-C POWER6 3.5-47
+C cycles/limb
+C POWER3/PPC630 1.83 (1.5 c/l should be possible)
+C POWER4/PPC970 3 (2.0 c/l should be possible)
+C POWER5 3
+C POWER6 3.5-47
+C POWER7 3
C STATUS
C * Try combining upx+up, and vpx+vp.
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index b1a3315b6..4b843a044 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C mpn_addmul_1 mpn_submul_1
-C cycles/limb cycles/limb
-C POWER3/PPC630 6-18 6-18
-C POWER4/PPC970 8 8.3
-C POWER5 8 8.25
-C POWER6 16.25 16.75
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8 8.3
+C POWER5 8 8.25
+C POWER6 16.25 16.75
+C POWER7 3.77 4.9
C TODO
C * Try to reduce the number of needed live registers
@@ -53,7 +54,7 @@ ifdef(`OPERATION_submul_1',`
')
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
+
ASM_START()
PROLOGUE(func_nc)
EPILOGUE()
diff --git a/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
index 40f3d4ec7..e88fc4440 100644
--- a/mpn/powerpc64/mode64/bdiv_dbm1c.asm
+++ b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -19,11 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
+C cycles/limb
C POWER3/PPC630 6-18
C POWER4/PPC970 8.5?
C POWER5 8.5 fluctuating as function of n % 3
C POWER6 15
+C POWER6 15
+C POWER7 4.75
C TODO
C * Nothing to do...
diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm
index d457d65e9..0f94154bf 100644
--- a/mpn/powerpc64/mode64/dive_1.asm
+++ b/mpn/powerpc64/mode64/dive_1.asm
@@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm
+C cycles/limb
+C norm unorm
C POWER3/PPC630 13-19
-C POWER4/PPC970 16
-C POWER5 16 16
-C POWER6 37 46
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
+C POWER7 12 12
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
diff --git a/mpn/powerpc64/mode64/divrem_1.asm b/mpn/powerpc64/mode64/divrem_1.asm
index 9d065b728..c0e7b2a9f 100644
--- a/mpn/powerpc64/mode64/divrem_1.asm
+++ b/mpn/powerpc64/mode64/divrem_1.asm
@@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm frac
-C POWER3/PPC630 16-34 16-34 ~11
-C POWER4/PPC970 29 19
-C POWER5 29 29 ~20
-C POWER6 50 59 ~42
+C cycles/limb
+C norm unorm frac
+C POWER3/PPC630 16-34 16-34 ~11
+C POWER4/PPC970 29 19
+C POWER5 29 29 ~20
+C POWER6 50 59 ~42
+C POWER7 25 25 ~14
C INPUT PARAMETERS
C qp = r3
diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm
index 53ef1c708..18f549357 100644
--- a/mpn/powerpc64/mode64/divrem_2.asm
+++ b/mpn/powerpc64/mode64/divrem_2.asm
@@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm frac
+C cycles/limb
+C norm frac
C POWER3/PPC630
-C POWER4/PPC970 ? ?
-C POWER5 37 ?
-C POWER6 62 ?
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
+C POWER7 30.5 ?
C INPUT PARAMETERS
C qp = r3
diff --git a/mpn/powerpc64/mode64/invert_limb.asm b/mpn/powerpc64/mode64/invert_limb.asm
index aed0a32ab..31b243001 100644
--- a/mpn/powerpc64/mode64/invert_limb.asm
+++ b/mpn/powerpc64/mode64/invert_limb.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb (approximate)
-C POWER3/PPC630 80
-C POWER4/PPC970 86
-C POWER5 86
-C POWER6 170
+C cycles/limb (approximate)
+C POWER3/PPC630 80
+C POWER4/PPC970 86
+C POWER5 86
+C POWER6 170
+C POWER7 66
ASM_START()
PROLOGUE(mpn_invert_limb)
diff --git a/mpn/powerpc64/mode64/mod_1_1.asm b/mpn/powerpc64/mode64/mod_1_1.asm
index 61e39310a..f24ceb2c8 100644
--- a/mpn/powerpc64/mode64/mod_1_1.asm
+++ b/mpn/powerpc64/mode64/mod_1_1.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 17
-C POWER5 16
-C POWER6 30
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 17
+C POWER5 16
+C POWER6 30
+C POWER7 10.2
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_1_4.asm b/mpn/powerpc64/mode64/mod_1_4.asm
index e0f26da96..b6163c5e7 100644
--- a/mpn/powerpc64/mode64/mod_1_4.asm
+++ b/mpn/powerpc64/mode64/mod_1_4.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 9
-C POWER5 9
-C POWER6 13
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 9
+C POWER5 9
+C POWER6 13
+C POWER7 3.5
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_34lsub1.asm b/mpn/powerpc64/mode64/mod_34lsub1.asm
index 62ba17a3c..30b9f98be 100644
--- a/mpn/powerpc64/mode64/mod_34lsub1.asm
+++ b/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.33
-C POWER4/PPC970 1.5
-C POWER5 1.32
-C POWER6 2.35
+C cycles/limb
+C POWER3/PPC630 1.33
+C POWER4/PPC970 1.5
+C POWER5 1.32
+C POWER6 2.35
+C POWER7 1
C INPUT PARAMETERS
define(`up',`r3')
diff --git a/mpn/powerpc64/mode64/mode1o.asm b/mpn/powerpc64/mode64/mode1o.asm
index 489ca8551..37e4028d8 100644
--- a/mpn/powerpc64/mode64/mode1o.asm
+++ b/mpn/powerpc64/mode64/mode1o.asm
@@ -19,10 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 13-19
-C POWER4/PPC970: 16
-C POWER5: 16
+C cycles/limb
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16
+C POWER6 ?
+C POWER7 12
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm
index 12bff2fb6..e911cf551 100644
--- a/mpn/powerpc64/mode64/mul_1.asm
+++ b/mpn/powerpc64/mode64/mul_1.asm
@@ -21,11 +21,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 7.25? not updated for last file revision
-C POWER5 7.25
-C POWER6 14
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
+C POWER7 2.9
C TODO
C * Try to reduce the number of needed live registers (at least r5 and r10
diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm
index fd7ff9aa1..9a3957f94 100644
--- a/mpn/powerpc64/mode64/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl PowerPC-64 mpn_basecase.
+dnl PowerPC-64 mpn_mul_basecase.
dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
dnl Foundation, Inc.
@@ -20,11 +20,11 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 8
-C POWER5 8
-C POWER6 24
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 24
C INPUT PARAMETERS
define(`rp', `r3')
diff --git a/mpn/powerpc64/mode64/p3/gmp-mparam.h b/mpn/powerpc64/mode64/p3/gmp-mparam.h
index 221b0e1d8..cf1d8ca47 100644
--- a/mpn/powerpc64/mode64/p3/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p3/gmp-mparam.h
@@ -23,12 +23,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -36,22 +37,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 33
#define MUL_TOOM44_THRESHOLD 46
#define MUL_TOOM6H_THRESHOLD 77
-#define MUL_TOOM8H_THRESHOLD 115
+#define MUL_TOOM8H_THRESHOLD 139
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 38
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 33
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 32
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 16
-#define SQR_TOOM3_THRESHOLD 49
-#define SQR_TOOM4_THRESHOLD 70
-#define SQR_TOOM6_THRESHOLD 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 48
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 14
+#define SQR_TOOM3_THRESHOLD 45
+#define SQR_TOOM4_THRESHOLD 64
+#define SQR_TOOM6_THRESHOLD 85
#define SQR_TOOM8_THRESHOLD 139
+#define MULMID_TOOM42_THRESHOLD 22
+
#define MULMOD_BNM1_THRESHOLD 8
-#define SQRMOD_BNM1_THRESHOLD 9
+#define SQRMOD_BNM1_THRESHOLD 10
+
+#define POWM_SEC_TABLE 2,23,127,502,1421
#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -123,35 +128,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 118
#define SQR_FFT_THRESHOLD 1728
-#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 28
-#define MULLO_MUL_N_THRESHOLD 4940
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 27
+#define MULLO_MUL_N_THRESHOLD 2367
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 95
-#define DC_BDIV_QR_THRESHOLD 28
+#define DC_DIV_QR_THRESHOLD 26
+#define DC_DIVAPPR_Q_THRESHOLD 87
+#define DC_BDIV_QR_THRESHOLD 27
#define DC_BDIV_Q_THRESHOLD 62
-#define INV_MULMOD_BNM1_THRESHOLD 29
-#define INV_NEWTON_THRESHOLD 92
-#define INV_APPR_THRESHOLD 94
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 91
+#define INV_APPR_THRESHOLD 91
#define BINV_NEWTON_THRESHOLD 115
-#define REDC_1_TO_REDC_N_THRESHOLD 30
+#define REDC_1_TO_REDC_N_THRESHOLD 31
#define MU_DIV_QR_THRESHOLD 551
#define MU_DIVAPPR_Q_THRESHOLD 551
-#define MUPI_DIV_QR_THRESHOLD 49
-#define MU_BDIV_QR_THRESHOLD 492
+#define MUPI_DIV_QR_THRESHOLD 50
+#define MU_BDIV_QR_THRESHOLD 474
#define MU_BDIV_Q_THRESHOLD 492
-#define MATRIX22_STRASSEN_THRESHOLD 9
-#define HGCD_THRESHOLD 55
-#define GCD_DC_THRESHOLD 150
-#define GCDEXT_DC_THRESHOLD 124
+#define MATRIX22_STRASSEN_THRESHOLD 8
+#define HGCD_THRESHOLD 53
+#define HGCD_APPR_THRESHOLD 55
+#define HGCD_REDUCE_THRESHOLD 688
+#define GCD_DC_THRESHOLD 148
+#define GCDEXT_DC_THRESHOLD 118
#define JACOBI_BASE_METHOD 1
-#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_DC_THRESHOLD 16
#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 354
+#define SET_STR_DC_THRESHOLD 375
#define SET_STR_PRECOMPUTE_THRESHOLD 812
diff --git a/mpn/powerpc64/mode64/p4/gmp-mparam.h b/mpn/powerpc64/mode64/p4/gmp-mparam.h
index 9a0932654..317bc94d6 100644
--- a/mpn/powerpc64/mode64/p4/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p4/gmp-mparam.h
@@ -29,6 +29,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 37
@@ -43,16 +44,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 62
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 136
-#define SQR_TOOM6_THRESHOLD 181
-#define SQR_TOOM8_THRESHOLD 272
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 214
+#define SQR_TOOM6_THRESHOLD 254
+#define SQR_TOOM8_THRESHOLD 430
-#define MULMOD_BNM1_THRESHOLD 13
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 12
#define SQRMOD_BNM1_THRESHOLD 16
+#define POWM_SEC_TABLE 6,47,347,1036,2826
+
#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 372, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
@@ -116,9 +121,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 103
#define SQR_FFT_THRESHOLD 2752
-#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_BASECASE_THRESHOLD 3
#define MULLO_DC_THRESHOLD 36
-#define MULLO_MUL_N_THRESHOLD 12691
+#define MULLO_MUL_N_THRESHOLD 13463
#define DC_DIV_QR_THRESHOLD 43
#define DC_DIVAPPR_Q_THRESHOLD 158
@@ -139,12 +144,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 998
#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 105
+#define HGCD_THRESHOLD 103
+#define HGCD_APPR_THRESHOLD 110
+#define HGCD_REDUCE_THRESHOLD 1962
#define GCD_DC_THRESHOLD 318
#define GCDEXT_DC_THRESHOLD 242
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 858
-#define SET_STR_PRECOMPUTE_THRESHOLD 1864
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1781
diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h
index 827b555c8..9220f99d5 100644
--- a/mpn/powerpc64/mode64/p5/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -1,4 +1,4 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
Software Foundation, Inc.
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 40
@@ -38,22 +39,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 50
#define MUL_TOOM44_THRESHOLD 121
#define MUL_TOOM6H_THRESHOLD 202
-#define MUL_TOOM8H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 260
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 82
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
-#define SQR_BASECASE_THRESHOLD 9
-#define SQR_TOOM2_THRESHOLD 36
-#define SQR_TOOM3_THRESHOLD 59
-#define SQR_TOOM4_THRESHOLD 147
-#define SQR_TOOM6_THRESHOLD 204
-#define SQR_TOOM8_THRESHOLD 288
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 142
+#define SQR_TOOM6_THRESHOLD 191
+#define SQR_TOOM8_THRESHOLD 284
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 16
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 4,35,387,1068,2699
#define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -166,15 +171,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 0
-#define MULLO_DC_THRESHOLD 31
+#define MULLO_DC_THRESHOLD 42
#define MULLO_MUL_N_THRESHOLD 6633
-#define DC_DIV_QR_THRESHOLD 37
+#define DC_DIV_QR_THRESHOLD 43
#define DC_DIVAPPR_Q_THRESHOLD 155
#define DC_BDIV_QR_THRESHOLD 46
-#define DC_BDIV_Q_THRESHOLD 112
+#define DC_BDIV_Q_THRESHOLD 120
-#define INV_MULMOD_BNM1_THRESHOLD 26
+#define INV_MULMOD_BNM1_THRESHOLD 52
#define INV_NEWTON_THRESHOLD 177
#define INV_APPR_THRESHOLD 165
@@ -189,11 +194,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MATRIX22_STRASSEN_THRESHOLD 15
#define HGCD_THRESHOLD 108
-#define GCD_DC_THRESHOLD 303
+#define HGCD_APPR_THRESHOLD 113
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 315
#define GCDEXT_DC_THRESHOLD 237
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 13
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 532
-#define SET_STR_PRECOMPUTE_THRESHOLD 1639
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1585
diff --git a/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
new file mode 100644
index 000000000..4bd508488
--- /dev/null
+++ b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
@@ -0,0 +1,172 @@
+dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011
+dnl Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 ? ?
+C POWER4/PPC970 ? ?
+C POWER5 ? ?
+C POWER6 12.25 12.8
+C POWER7 ? ?
+
+C TODO
+C * Reduce register usage.
+C * Schedule function entry code.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addmul_1)
+ define(func_nc, mpn_addmul_1c) C FIXME: not really supported
+ define(AM, `$1')
+ define(SM, `')
+ define(CLRRSC, `addic $1, r0, 0')
+')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_submul_1)
+ define(func_nc, mpn_submul_1c) C FIXME: not really supported
+ define(AM, `')
+ define(SM, `$1')
+ define(CLRRSC, `subfc $1, r0, r0')
+')
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy loop count into ctr
+ beq cr0, L(b0)
+ blt cr6, L(b1)
+ beq cr6, L(b2)
+
+L(b3): ld r8, 0(up)
+ ld r7, 8(up)
+ ld r27, 16(up)
+ addi up, up, 16
+ addi rp, rp, 16
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r29, -16(rp)
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r5, r5, r29
+ b L(l3)
+
+L(b2): ld r7, 0(up)
+ ld r27, 8(up)
+ addi up, up, 8
+ addi rp, rp, 8
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r11, r11, r7
+ addze r12, r27
+ ADDSUB r9, r9, r30
+ b L(l2)
+
+L(b1): ld r27, 0(up)
+ ld r31, 0(rp)
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ADDSUB r11, r11, r31
+ b L(l1)
+
+L(b0): addi up, up, -8
+ addi rp, rp, -8
+ CLRRSC( r12) C clear r12 and clr/set cy
+
+ ALIGN(32)
+L(top):
+SM(` subfe r11, r0, r0') C complement...
+SM(` addic r11, r11, 1') C ...carry flag
+ ld r10, 8(up)
+ ld r8, 16(up)
+ ld r7, 24(up)
+ ld r27, 32(up)
+ addi up, up, 32
+ addi rp, rp, 32
+ mulld r0, r10, v0
+ mulhdu r10, r10, v0
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r28, -24(rp)
+ adde r0, r0, r12
+ ld r29, -16(rp)
+ adde r5, r5, r10
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ adde r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r0, r0, r28
+ std r0, -24(rp)
+ ADDSUBC r5, r5, r29
+L(l3): std r5, -16(rp)
+ ADDSUBC r9, r9, r30
+L(l2): std r9, -8(rp)
+ ADDSUBC r11, r11, r31
+L(l1): std r11, 0(rp)
+ bdnz L(top)
+
+AM(` addze r3, r12')
+SM(` subfe r11, r0, r0') C complement...
+ ld r31, -8(r1)
+SM(` subf r3, r11, r12')
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h
index d447b56d9..5392138f1 100644
--- a/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -1,7 +1,7 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
-Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 21
@@ -38,23 +39,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 50
#define MUL_TOOM44_THRESHOLD 112
#define MUL_TOOM6H_THRESHOLD 274
-#define MUL_TOOM8H_THRESHOLD 430
+#define MUL_TOOM8H_THRESHOLD 339
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78
-#define SQR_BASECASE_THRESHOLD 9
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 53
-#define SQR_TOOM4_THRESHOLD 148
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 49
+#define SQR_TOOM4_THRESHOLD 136
#define SQR_TOOM6_THRESHOLD 226
-#define SQR_TOOM8_THRESHOLD 430
+#define SQR_TOOM8_THRESHOLD 393
+
+#define MULMID_TOOM42_THRESHOLD 36
#define MULMOD_BNM1_THRESHOLD 14
#define SQRMOD_BNM1_THRESHOLD 14
+#define POWM_SEC_TABLE 4,23,213,840,2618
+
#define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
@@ -106,34 +111,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2368
#define MULLO_BASECASE_THRESHOLD 5
-#define MULLO_DC_THRESHOLD 28
-#define MULLO_MUL_N_THRESHOLD 6633
-
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 112
-#define DC_BDIV_QR_THRESHOLD 29
-#define DC_BDIV_Q_THRESHOLD 86
-
-#define INV_MULMOD_BNM1_THRESHOLD 47
-#define INV_NEWTON_THRESHOLD 93
-#define INV_APPR_THRESHOLD 91
-
-#define BINV_NEWTON_THRESHOLD 132
-#define REDC_1_TO_REDC_N_THRESHOLD 39
-
-#define MU_DIV_QR_THRESHOLD 855
-#define MU_DIVAPPR_Q_THRESHOLD 807
-#define MUPI_DIV_QR_THRESHOLD 33
-#define MU_BDIV_QR_THRESHOLD 807
-#define MU_BDIV_Q_THRESHOLD 872
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 64
-#define GCD_DC_THRESHOLD 237
-#define GCDEXT_DC_THRESHOLD 183
+#define MULLO_DC_THRESHOLD 61
+#define MULLO_MUL_N_THRESHOLD 3271
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 200
+#define DC_BDIV_QR_THRESHOLD 70
+#define DC_BDIV_Q_THRESHOLD 168
+
+#define INV_MULMOD_BNM1_THRESHOLD 61
+#define INV_NEWTON_THRESHOLD 166
+#define INV_APPR_THRESHOLD 166
+
+#define BINV_NEWTON_THRESHOLD 222
+#define REDC_1_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 59
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 109
+#define HGCD_APPR_THRESHOLD 108
+#define HGCD_REDUCE_THRESHOLD 1052
+#define GCD_DC_THRESHOLD 501
+#define GCDEXT_DC_THRESHOLD 249
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 17
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 29
#define SET_STR_DC_THRESHOLD 532
-#define SET_STR_PRECOMPUTE_THRESHOLD 1648
+#define SET_STR_PRECOMPUTE_THRESHOLD 1639
diff --git a/mpn/powerpc64/mode64/p6/mul_basecase.asm b/mpn/powerpc64/mode64/p6/mul_basecase.asm
index 427d6081a..52c5af8ff 100644
--- a/mpn/powerpc64/mode64/p6/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl PowerPC-64 mpn_basecase.
+dnl PowerPC-64 mpn_mul_basecase.
dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010 Free
dnl Software Foundation, Inc.
diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h
new file mode 100644
index 000000000..02603c525
--- /dev/null
+++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -0,0 +1,159 @@
+/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define BYTES_PER_MP_LIMB 8
+
+/* 3550 MHz POWER7 (gcc110.fsffrance.org) */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 28
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 202
+#define MUL_TOOM6H_THRESHOLD 298
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 143
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 135
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 141
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 36
+#define SQR_TOOM3_THRESHOLD 109
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 303
+#define SQR_TOOM8_THRESHOLD 399
+
+#define MULMID_TOOM42_THRESHOLD 62
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define POWM_SEC_TABLE 6,65,342,1465
+
+#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 436, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 43,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \
+ { 95,11}, { 191,10}, { 383,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 575,11}, \
+ { 303,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,11}, { 175,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,11}, { 479,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 103
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 23
+#define MULLO_MUL_N_THRESHOLD 9174
+
+#define DC_DIV_QR_THRESHOLD 30
+#define DC_DIVAPPR_Q_THRESHOLD 124
+#define DC_BDIV_QR_THRESHOLD 66
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 81
+#define INV_NEWTON_THRESHOLD 165
+#define INV_APPR_THRESHOLD 133
+
+#define BINV_NEWTON_THRESHOLD 300
+#define REDC_1_TO_REDC_N_THRESHOLD 76
+
+#define MU_DIV_QR_THRESHOLD 1470
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 58
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1499
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 124
+#define HGCD_APPR_THRESHOLD 155
+#define HGCD_REDUCE_THRESHOLD 3134
+#define GCD_DC_THRESHOLD 492
+#define GCDEXT_DC_THRESHOLD 333
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 1517
+#define SET_STR_PRECOMPUTE_THRESHOLD 3421
diff --git a/mpn/powerpc64/mode64/rsh1add_n.asm b/mpn/powerpc64/mode64/rsh1add_n.asm
index 8af3ca774..2a5ef3060 100644
--- a/mpn/powerpc64/mode64/rsh1add_n.asm
+++ b/mpn/powerpc64/mode64/rsh1add_n.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 2 (1.5 c/l should be possible)
-C POWER4/PPC970 4 (2.0 c/l should be possible)
-C POWER5 3.5 (2.0 c/l should be possible)
-C POWER6 4.5
+C cycles/limb
+C POWER3/PPC630 2 (1.5 c/l should be possible)
+C POWER4/PPC970 4 (2.0 c/l should be possible)
+C POWER5 3.5 (2.0 c/l should be possible)
+C POWER6 4.5
+C POWER7 3.5
define(`rp',`r3')
define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/rsh1sub_n.asm b/mpn/powerpc64/mode64/rsh1sub_n.asm
index 1faa03379..b10eb8ab7 100644
--- a/mpn/powerpc64/mode64/rsh1sub_n.asm
+++ b/mpn/powerpc64/mode64/rsh1sub_n.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 2 (1.5 c/l should be possible)
-C POWER4/PPC970 4 (2.0 c/l should be possible)
-C POWER5 3.5 (2.0 c/l should be possible)
-C POWER6 4.5
+C cycles/limb
+C POWER3/PPC630 2 (1.5 c/l should be possible)
+C POWER4/PPC970 4 (2.0 c/l should be possible)
+C POWER5 3.5 (2.0 c/l should be possible)
+C POWER6 4.5
+C POWER7 3.5
define(`rp',`r3')
define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/sqr_basecase.asm b/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 000000000..72ac2d318
--- /dev/null
+++ b/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,852 @@
+dnl PowerPC-64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 Free
+dnl Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 16.25
+C POWER7 3.77
+
+C NOTES
+C * This is very crude, cleanup!
+C * Try to reduce the number of needed live registers.
+C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The
+C cost will be more live registers.
+C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C size a lot and speed things up perhaps 25%.
+C * Use computed goto in order to compress the code.
+C * Implement a larger final corner.
+C * Schedule callee-saves register saves into other insns. This could save
+C about 5 cycles/call. (We cannot analogously optimise the restores, since
+C the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C * Should the alternating std/adde sequences be split? Some pipelines handle
+C adde poorly, and might sequentialise all these instructions.
+C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C adjacent integer multiply insns. Except for the multiply insns, the code
+C was not carefully optimised for POWER6 or any other CPU.
+C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved', `r24')
+
+ASM_START()
+C mpn_sqr_basecase entry; rp = r3, up = r4, n = r5 per the defines above.
+C Sizes below 2 (treated as n = 1) and n = 2 are finished inline here using
+C only r0 and r4-r11; larger sizes branch to L(gt2).
+PROLOGUE(mpn_sqr_basecase)
+ cmpdi cr0, n, 2 C cr0 is tested twice: here and again at L(ge2)
+ bge cr0, L(ge2)
+ ld r5, 0(up) C n = 1
+ nop
+ mulld r8, r5, r5 C weight 0
+ mulhdu r9, r5, r5 C weight 1
+ std r8, 0(rp)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(ge2): bgt cr0, L(gt2)
+ ld r0, 0(up) C n = 2
+ nop
+ mulld r8, r0, r0 C u0 * u0 (low limb)
+ mulhdu r9, r0, r0 C u0 * u0 (high limb)
+ ld r6, 8(up) C last read through up; r4 and r5 become temporaries
+ mulld r10, r6, r6 C u1 * u1 (low limb)
+ mulhdu r11, r6, r6 C u1 * u1 (high limb)
+ mulld r4, r6, r0 C u1 * u0 (low limb)
+ mulhdu r5, r6, r0 C u1 * u0 (high limb)
+ addc r4, r4, r4 C double the cross product r5:r4 ...
+ adde r5, r5, r5
+ addze r11, r11 C ... carry out of the doubling goes to the top limb
+ addc r9, r9, r4 C add the doubled cross product at limbs 1 and 2
+ adde r10, r10, r5
+ addze r11, r11
+ std r8, 0(rp)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ std r11, 24(rp)
+ blr
+
+ ALIGN(16)
+C General case, n > 2.  r21-r31 are stored at negative offsets from r1; r1
+C itself is never adjusted in this routine, and the same slots are reloaded
+C just before the final blr.
+C NOTE(review): presumably this relies on the ABI guaranteeing a usable area
+C below the stack pointer -- confirm for every ABI this file is built for.
+L(gt2): std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+ std r22, -80(r1)
+ std r21, -88(r1)
+
+ mr rp_saved, rp C r22: rp kept for the diagonal pass at the end
+ mr up_saved, up C r23: up kept for the diagonal pass at the end
+ mr n_saved, n C r24: n kept for the diagonal pass at the end
+ mr rp_outer, rp C r25: result base, stepped by each outer pass
+ mr up_outer, up C r21: multiplier pointer, stepped by each outer pass
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+C NOTE(review): addic also writes CA; the L(b3) and L(b0) paths reach their
+C first adde with no other CA-writing instruction in between, so this looks
+C like the place CA is cleared for them (n + 2 must not wrap).
+ addic r7, n, 2 C compute count...
+ srdi r7, r7, 2 C ...for ctr
+ mtctr r7 C copy count into ctr
+ beq- cr0, L(b0) C n & 3 == 0
+ blt- cr6, L(b1) C n & 3 == 1
+ beq- cr6, L(b2) C n & 3 == 2; fall through to L(b3) when n & 3 == 3
+
+C First pass for n & 3 == 3: r6 = 0(up) is the multiplier; the products of r6
+C with the limbs from 8(up) upward are stored from 8(rp) upward (plain stores,
+C nothing is read back from rp).  r12 is the high limb carried between
+C products and CA stays live across the loop back-edge.
+L(b3): ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ addi up, up, 24
+ li r12, 0 C carry limb
+ bdz L(em3) C ctr reaches 0: only the two preloaded limbs remain
+
+ ALIGN(16)
+L(tm3): mulld r0, r9, r6 C products 1 and 2 of this trip
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12 C add the high limb carried in
+ adde r7, r7, r26
+ mulld r26, r9, r6 C products 3 and 4; r26 now holds a low limb
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C high limb carried to the next adde r0
+ ld r9, 16(up) C preload for the next trip or for L(em3)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm3)
+
+L(em3): mulld r0, r9, r6 C wind-down: the last two products
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the last carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C bias n; every outer pass subtracts 1 before using it
+ b L(outer_loop)
+
+C First pass for n & 3 == 0: like L(b3) but one product is peeled first, so
+C r12 starts as the high limb of that product instead of zero.  The loop and
+C wind-down are the same instruction sequences as L(tm3) and L(em3).
+L(b0): ld r6, 0(up) C r6 = multiplier
+ ld r27, 8(up)
+ mulld r7, r27, r6 C peeled product
+ mulhdu r12, r27, r6 C its high limb is the initial carry limb
+ std r7, 8(rp)
+ addi rp, rp, 8
+ ld r9, 16(up) C preload for the loop or for L(em0)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(em0)
+
+ ALIGN(16)
+L(tm0): mulld r0, r9, r6 C products 1 and 2 of this trip
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12 C add the high limb carried in
+ adde r7, r7, r26
+ mulld r26, r9, r6 C products 3 and 4; r26 now holds a low limb
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C high limb carried to the next adde r0
+ ld r9, 16(up) C preload for the next trip or for L(em0)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm0)
+
+L(em0): mulld r0, r9, r6 C wind-down: the last two products
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the last carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C bias n; every outer pass subtracts 1 before using it
+ b L(outer_loop_ent_2)
+
+C First pass for n & 3 == 1: two products are peeled before the 4-way loop.
+C The addc below starts the carry chain that the loop continues with adde.
+C The loop and wind-down are the same instruction sequences as L(tm3)/L(em3).
+L(b1): ld r6, 0(up) C r6 = multiplier
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6 C peeled product 1
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6 C peeled product 2
+ mulhdu r12, r27, r6 C its high limb is the initial carry limb
+ addc r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addi rp, rp, 16
+ ld r9, 24(up) C preload for the loop or for L(em1)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(em1)
+
+ ALIGN(16)
+L(tm1): mulld r0, r9, r6 C products 1 and 2 of this trip
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12 C add the high limb carried in
+ adde r7, r7, r26
+ mulld r26, r9, r6 C products 3 and 4; r26 now holds a low limb
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C high limb carried to the next adde r0
+ ld r9, 16(up) C preload for the next trip or for L(em1)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm1)
+
+L(em1): mulld r0, r9, r6 C wind-down: the last two products
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the last carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C bias n; every outer pass subtracts 1 before using it
+ b L(outer_loop_ent_3)
+
+C First pass for n & 3 == 2: three products are peeled before the 4-way loop.
+C r7 still holds the (n + 2) >> 2 count computed before the dispatch; this
+C path reloads ctr with one less than that (the two FIXME lines).
+C The loop and wind-down are the same instruction sequences as L(tm3)/L(em3).
+L(b2): addi r7, r7, -1 C FIXME
+ mtctr r7 C FIXME
+ ld r6, 0(up) C r6 = multiplier
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6 C peeled product 1
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6 C peeled product 2 (r7 is free again after mtctr)
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ mulld r11, r9, r6 C peeled product 3
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10 C initial carry limb for the loop
+ std r0, 8(rp)
+ std r7, 16(rp)
+ std r11, 24(rp)
+ addi rp, rp, 24
+ ld r9, 32(up) C preload for the loop or for L(em2)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(em2)
+
+ ALIGN(16)
+L(tm2): mulld r0, r9, r6 C products 1 and 2 of this trip
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12 C add the high limb carried in
+ adde r7, r7, r26
+ mulld r26, r9, r6 C products 3 and 4; r26 now holds a low limb
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C high limb carried to the next adde r0
+ ld r9, 16(up) C preload for the next trip or for L(em2)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm2)
+
+L(em2): mulld r0, r9, r6 C wind-down: the last two products
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the last carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C bias n; every outer pass subtracts 1 before using it
+ b L(outer_loop_ent_0)
+
+
+C Outer pass, variant that peels three products.  The four variants run in
+C rotation: this one falls through into L(outer_loop_ent_0), that one into
+C L(outer_loop_ent_3), then L(outer_loop_ent_2), which branches back here.
+C Each variant steps up_outer by one limb and rp_outer by two limbs, takes
+C 0(up) as the multiplier (r6) and adds multiplier * following limbs into the
+C limbs already stored from rp_outer + 8 upward.
+C The trailing numbers on loop instructions appear to list operand registers.
+L(outer_loop):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ bdz L(outer_end) C count exhausted: only the final corner is left
+
+ ld r6, 0(up) C r6 = multiplier for this row
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6 C peeled product 1
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6 C peeled product 2
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r11, r9, r6 C peeled product 3
+ mulhdu r10, r9, r6
+ addc r7, r7, r26 C chain 1: combine product halves
+ adde r11, r11, r8
+ addze r12, r10 C r12 = carry limb into the loop
+ addc r0, r0, r28 C chain 2: add the limbs already in rp
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ adde r11, r11, r30 C CA from here is consumed by the next adde r0
+ std r11, 16(rp)
+ addi rp, rp, 24
+ ld r9, 32(up) C preload for the loop or for L(ea1)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(ea1)
+
+ ALIGN(16)
+L(ta1): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 26 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 26
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 26 30
+ std r26, 16(rp) C 26
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta1)
+
+L(ea1): mulld r0, r9, r6 C wind-down: the last two products of this row
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8 C carry of the product chain into the top limb
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8 C carry of the rp chain into the top limb
+ std r8, 16(rp) C stored, not accumulated: first write of this limb
+
+C Outer pass, variant that peels two products.  Reached by fall-through from
+C L(ea1) and by branch from L(em2); falls through into L(outer_loop_ent_3).
+C Same row setup as L(outer_loop): step up_outer by one limb and rp_outer by
+C two, multiplier in r6, accumulate into the limbs from rp_outer + 8 upward.
+C The trailing numbers on loop instructions appear to list operand registers.
+L(outer_loop_ent_0):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up) C r6 = multiplier for this row
+ ld r9, 8(up)
+ ld r27, 16(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ mulld r0, r9, r6 C peeled product 1
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6 C peeled product 2
+ mulhdu r8, r27, r6
+ addc r0, r0, r28 C one interleaved carry chain through addc/adde/addze
+ adde r7, r7, r26
+ addze r12, r8 C r12 = carry limb into the loop
+ std r0, 0(rp)
+ adde r7, r7, r29 C CA from here is consumed by the next adde r0
+ std r7, 8(rp)
+ addi rp, rp, 16
+ ld r9, 24(up) C preload for the loop or for L(ea0)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(ea0)
+
+ ALIGN(16)
+L(ta0): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 26 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 26
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 26 30
+ std r26, 16(rp) C 26
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta0)
+
+L(ea0): mulld r0, r9, r6 C wind-down: the last two products of this row
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8 C carry of the product chain into the top limb
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8 C carry of the rp chain into the top limb
+ std r8, 16(rp) C stored, not accumulated: first write of this limb
+
+C Outer pass, variant that peels one product.  Reached by fall-through from
+C L(ea0) and by branch from L(em1); falls through into L(outer_loop_ent_2).
+C Same row setup as L(outer_loop): step up_outer by one limb and rp_outer by
+C two, multiplier in r6, accumulate into the limbs from rp_outer + 8 upward.
+C The trailing numbers on loop instructions appear to list operand registers.
+L(outer_loop_ent_3):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up) C r6 = multiplier for this row
+ ld r9, 8(up)
+ ld r28, 0(rp)
+ mulld r0, r9, r6 C peeled product
+ mulhdu r12, r9, r6 C its high limb is the carry limb into the loop
+ addc r0, r0, r28 C CA from here is consumed by the next adde r0
+ std r0, 0(rp)
+ addi rp, rp, 8
+ ld r9, 16(up) C preload for the loop or for L(ea3)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(ea3)
+
+ ALIGN(16)
+L(ta3): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 26 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 26
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 26 30
+ std r26, 16(rp) C 26
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta3)
+
+L(ea3): mulld r0, r9, r6 C wind-down: the last two products of this row
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8 C carry of the product chain into the top limb
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8 C carry of the rp chain into the top limb
+ std r8, 16(rp) C stored, not accumulated: first write of this limb
+
+
+C Outer pass, variant that peels nothing.  Reached by fall-through from
+C L(ea3) and by branch from L(em0); ends by branching back to L(outer_loop).
+C Same row setup as L(outer_loop): step up_outer by one limb and rp_outer by
+C two, multiplier in r6, accumulate into the limbs from rp_outer + 8 upward.
+C The trailing numbers on loop instructions appear to list operand registers.
+L(outer_loop_ent_2):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ addic r0, r0, 0 C adding zero cannot carry, so this leaves CA = 0
+ li r12, 0 C cy_limb = 0
+ ld r6, 0(up) C r6 = multiplier for this row
+ ld r9, 8(up)
+ ld r27, 16(up)
+ bdz L(ea2) C taken before up is advanced; L(ea2) does not read up
+ addi up, up, 24
+
+ ALIGN(16)
+L(ta2): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 26 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 26
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 26 30
+ std r26, 16(rp) C 26
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta2)
+
+L(ea2): mulld r0, r9, r6 C wind-down: the last two products of this row
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8 C carry of the product chain into the top limb
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8 C carry of the rp chain into the top limb
+ std r8, 16(rp) C stored, not accumulated: first write of this limb
+
+ b L(outer_loop)
+
+C Final corner of the off-diagonal sum: a single product 8(up) * 0(up) is
+C added to the limb at 0(rp) and its high limb plus carry is stored at 8(rp).
+C Control then falls into the diagonal pass set up below.
+L(outer_end):
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r11, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r8, r9, r6
+ addc r0, r0, r11
+ std r0, 0(rp)
+ addze r8, r8
+ std r8, 8(rp)
+
+C Diagonal pass.  From here on the m4 names are rebound: rp = rp_saved (r22),
+C up = r5, n = r6, climb = r0.  The pass walks the whole result again, doubles
+C the limbs stored from 8(rp) upward and adds in the squares of the up limbs.
+define(`rp', `rp_saved')
+define(`up', `r5')
+define(`n', `r6')
+define(`climb', `r0')
+
+ addi r4, rp_saved, 8 C NOTE(review): r4 is not read again before blr -- dead?
+ mr r5, up_saved C up = original source pointer
+ mr r6, n_saved C n = original limb count
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 2 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C put loop count into ctr
+ beq cr0, L(xb0) C n & 3 == 0
+ blt cr6, L(xb1) C n & 3 == 1
+ beq cr6, L(xb2) C n & 3 == 2; fall through to L(xb3) when n & 3 == 3
+
+C Diagonal pass feed-in for n & 3 == 3: squares three limbs of up and writes
+C result limbs 0-4.  Here rp is r22, up is r5 and climb is r0.
+L(xb3): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ addi up, up, 24
+ mulld r24, r6, r6 C r25:r24 = square of limb 0
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7 C r27:r26 = square of limb 1
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12 C r29:r28 = square of limb 2
+ mulhdu r29, r12, r12
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp) C r6 and r7 are reused for result limbs
+ ld r7, 32(rp)
+ addc r10, r10, r10 C double the stored limbs ...
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ addze climb, r29 C climb = last high square limb + carry of the doubling
+ addc r10, r10, r25 C ... then add the squares, shifted up by one limb
+ adde r11, r11, r26
+ adde r6, r6, r27
+ adde r7, r7, r28
+ std r24, 0(rp) C limb 0 is the low limb of the first square
+ std r10, 8(rp)
+ std r11, 16(rp)
+ std r6, 24(rp)
+ std r7, 32(rp)
+ addi rp, rp, 40
+ bdnz L(top) C CA of the last adde is still live at L(top) and L(end)
+ b L(end)
+
+C Diagonal pass feed-in for n & 3 == 2: squares two limbs of up and writes
+C result limbs 0-2.  Here rp is r22, up is r5 and climb is r0.
+L(xb2): ld r6, 0(up)
+ ld r7, 8(up)
+ addi up, up, 16
+ mulld r24, r6, r6 C r25:r24 = square of limb 0
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7 C r27:r26 = square of limb 1
+ mulhdu r27, r7, r7
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ addc r10, r10, r10 C double the stored limbs ...
+ adde r11, r11, r11
+ addze climb, r27 C climb = last high square limb + carry of the doubling
+ addc r10, r10, r25 C ... then add the squares, shifted up by one limb
+ adde r11, r11, r26
+ std r24, 0(rp) C limb 0 is the low limb of the first square
+ std r10, 8(rp)
+ std r11, 16(rp)
+ addi rp, rp, 24
+ bdnz L(top) C CA of the last adde is still live at L(top) and L(end)
+ b L(end)
+
+C Diagonal pass feed-in for n & 3 == 0: squares four limbs of up and writes
+C result limbs 0-6.  Here rp is r22, up is r5 and climb is r0.  r23 held
+C up_saved, which was already copied to r5, so it is free as a temporary.
+L(xb0): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6 C r25:r24 = square of limb 0
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7 C r27:r26 = square of limb 1
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12 C r29:r28 = square of limb 2
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23 C r31:r30 = square of limb 3
+ mulhdu r31, r23, r23
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp) C r6, r7, r12, r23 are reused for result limbs
+ ld r7, 32(rp)
+ ld r12, 40(rp)
+ ld r23, 48(rp)
+ addc r10, r10, r10 C double the stored limbs ...
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze climb, r31 C climb = last high square limb + carry of the doubling
+ std r24, 0(rp) C limb 0 is the low limb of the first square
+ addc r10, r10, r25 C ... then add the squares, shifted up by one limb
+ std r10, 8(rp)
+ adde r11, r11, r26
+ std r11, 16(rp)
+ adde r6, r6, r27
+ std r6, 24(rp)
+ adde r7, r7, r28
+ std r7, 32(rp)
+ adde r12, r12, r29
+ std r12, 40(rp)
+ adde r23, r23, r30
+ std r23, 48(rp)
+ addi rp, rp, 56
+ bdnz L(top) C CA of the last adde is still live at L(top) and L(end)
+ b L(end)
+
+C Diagonal pass feed-in for n & 3 == 1 followed by the main loop.  L(xb1)
+C writes only result limb 0 and falls straight into L(top), so on this path
+C every ctr count is a loop trip; the other feed-ins use one count on their
+C own bdnz.  Here rp is r22, up is r5 and climb is r0.
+L(xb1): ld r6, 0(up)
+ addi up, up, 8
+ mulld r24, r6, r6
+ mulhdu climb, r6, r6 C high limb of the square becomes the carry limb
+ std r24, 0(rp)
+ addic rp, rp, 8 C clear carry as side-effect
+
+C Main loop: four limbs of up and eight result limbs per trip.  The doubling
+C chain begins with adde, so it picks up the CA left by the previous add
+C chain (or the cleared CA from L(xb1)).
+ ALIGN(32)
+L(top): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6 C r25:r24, r27:r26, r29:r28, r31:r30 = four squares
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r8, 0(rp)
+ ld r9, 8(rp)
+ adde r8, r8, r8 C double the stored limbs, carry chained throughout
+ adde r9, r9, r9
+ ld r10, 16(rp)
+ ld r11, 24(rp)
+ adde r10, r10, r10
+ adde r11, r11, r11
+ ld r6, 32(rp) C r6, r7, r12, r23 are reused for result limbs
+ ld r7, 40(rp)
+ adde r6, r6, r6
+ adde r7, r7, r7
+ ld r12, 48(rp)
+ ld r23, 56(rp)
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze r31, r31 C carry of the doubling joins the last high square limb
+ addc r8, r8, climb C new chain: add the carry limb from the previous group
+ std r8, 0(rp)
+ adde r9, r9, r24
+ std r9, 8(rp)
+ adde r10, r10, r25
+ std r10, 16(rp)
+ adde r11, r11, r26
+ std r11, 24(rp)
+ adde r6, r6, r27
+ std r6, 32(rp)
+ adde r7, r7, r28
+ std r7, 40(rp)
+ adde r12, r12, r29
+ std r12, 48(rp)
+ adde r23, r23, r30
+ std r23, 56(rp)
+ mr climb, r31 C carry limb for the next trip or for L(end)
+ addi rp, rp, 64
+ bdnz L(top)
+
+C Common exit of the diagonal pass: store the top result limb, reload the
+C callee-saved registers from the slots written at L(gt2) and return.
+L(end): addze climb, climb C fold the last live carry into the carry limb
+ std climb, 0(rp) C top limb of the result
+
+ ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ ld r25, -56(r1)
+ ld r24, -64(r1)
+ ld r23, -72(r1)
+ ld r22, -80(r1)
+ ld r21, -88(r1)
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
deleted file mode 100644
index 663f04c14..000000000
--- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl PowerPC-64 mpn_sqr_diag_addlsh1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 10
-C POWER4/PPC970 6
-C POWER5 5.375
-C POWER6 8.5
-
-C NOTES
-C * This was written for POWER6 and its preferences for adjacent integer
-C multiply insns. The cost is that we get a large set of live registers,
-C and therefore need to save 9 callee-saves registers. Except for the
-C multiply insns, the code was not carefully optimised for POWER6 or any
-C other CPU.
-C * Perform some cross-jumping in the feed-in code, into the loop's tail.
-
-C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n)
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`tp', `r4')
-define(`up', `r5')
-define(`n', `r6')
-
-define(`climb', `r0')
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
- std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
- std r26, -48(r1)
- std r25, -56(r1)
- std r24, -64(r1)
- std r23, -72(r1)
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2
- addi n, n, 2 C compute count...
- srdi n, n, 2 C ...for ctr
- mtctr n C put loop count into ctr
- beq cr0, L(b0)
- blt cr6, L(b1)
- beq cr6, L(b2)
-
-L(b3): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- addi up, up, 24
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- ld r10, 0(tp)
- ld r11, 8(tp)
- ld r6, 16(tp)
- ld r7, 24(tp)
- addi tp, tp, 32
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- addze climb, r29
- addc r10, r10, r25
- adde r11, r11, r26
- adde r6, r6, r27
- adde r7, r7, r28
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- std r6, 24(rp)
- std r7, 32(rp)
- addi rp, rp, 40
- bdnz L(top)
- b L(end)
-
-L(b2): ld r6, 0(up)
- ld r7, 8(up)
- addi up, up, 16
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- ld r10, 0(tp)
- ld r11, 8(tp)
- addi tp, tp, 16
- addc r10, r10, r10
- adde r11, r11, r11
- addze climb, r27
- addc r10, r10, r25
- adde r11, r11, r26
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- addi rp, rp, 24
- bdnz L(top)
- b L(end)
-
-L(b0): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r10, 0(tp)
- ld r11, 8(tp)
- ld r6, 16(tp)
- ld r7, 24(tp)
- ld r12, 32(tp)
- ld r23, 40(tp)
- addi tp, tp, 48
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- adde r12, r12, r12
- adde r23, r23, r23
- addze climb, r31
- std r24, 0(rp)
- addc r10, r10, r25
- std r10, 8(rp)
- adde r11, r11, r26
- std r11, 16(rp)
- adde r6, r6, r27
- std r6, 24(rp)
- adde r7, r7, r28
- std r7, 32(rp)
- adde r12, r12, r29
- std r12, 40(rp)
- adde r23, r23, r30
- std r23, 48(rp)
- addi rp, rp, 56
- bdnz L(top)
- b L(end)
-
-L(b1): ld r6, 0(up)
- addi up, up, 8
- mulld r24, r6, r6
- mulhdu climb, r6, r6
- std r24, 0(rp)
- addic rp, rp, 8 C clear carry as side-effect
-
- ALIGN(32)
-L(top): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r8, 0(tp)
- ld r9, 8(tp)
- adde r8, r8, r8
- adde r9, r9, r9
- ld r10, 16(tp)
- ld r11, 24(tp)
- adde r10, r10, r10
- adde r11, r11, r11
- ld r6, 32(tp)
- ld r7, 40(tp)
- adde r6, r6, r6
- adde r7, r7, r7
- ld r12, 48(tp)
- ld r23, 56(tp)
- adde r12, r12, r12
- adde r23, r23, r23
- addi tp, tp, 64
- addze r31, r31
- addc r8, r8, climb
- std r8, 0(rp)
- adde r9, r9, r24
- std r9, 8(rp)
- adde r10, r10, r25
- std r10, 16(rp)
- adde r11, r11, r26
- std r11, 24(rp)
- adde r6, r6, r27
- std r6, 32(rp)
- adde r7, r7, r28
- std r7, 40(rp)
- adde r12, r12, r29
- std r12, 48(rp)
- adde r23, r23, r30
- std r23, 56(rp)
- mr climb, r31
- addi rp, rp, 64
- bdnz L(top)
-
-L(end): addze climb, climb
- std climb, 0(rp)
-
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- ld r26, -48(r1)
- ld r25, -56(r1)
- ld r24, -64(r1)
- ld r23, -72(r1)
- blr
-EPILOGUE()
diff --git a/mpn/powerpc64/rshift.asm b/mpn/powerpc64/rshift.asm
index 6545af769..18406c57e 100644
--- a/mpn/powerpc64/rshift.asm
+++ b/mpn/powerpc64/rshift.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm
new file mode 100644
index 000000000..7d189388b
--- /dev/null
+++ b/mpn/powerpc64/tabselect.asm
@@ -0,0 +1,96 @@
+dnl PowerPC-64 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 3.3
+C POWER5 ?
+C POWER6 ?
+C POWER7 2.5
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r3') C destination, n limbs; read and rewritten once per table entry
+define(`tp', `r4') C table pointer; only read, advances through all entries
+define(`n', `r5') C limbs per entry (turned into a byte count below)
+define(`nents', `r6') C number of table entries; counts down once per outer pass
+define(`which', `r7') C 0-based index of the entry to copy into rp
+
+define(`mask', `r8') C all ones while on the selected entry, zero otherwise
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ addi r0, n, 1 C r0 = n + 1
+ srdi r0, r0, 1 C inner loop count = (n + 1) / 2; kept in r0 for every outer pass
+ andi. r9, n, 1 C cr0.eq set iff n is even; tested by beq at the head of each outer pass
+ subf which, nents, which C which -= nents, so which + nents hits 0 on the selected pass
+ sldi n, n, 3 C n = byte size of one entry (8 bytes per limb)
+
+L(outer):
+ mtctr r0 C put inner loop count in ctr
+
+ add r9, which, nents C are we at the selected table entry? (r9 = 0 iff yes)
+ addic r9, r9, -1 C set carry (CA) iff r9 != 0, i.e. iff not selected entry
+ subfe mask, r0, r0 C mask = ~r0 + r0 + CA = CA - 1: all ones if selected, else 0
+
+ beq cr0, L(top) C branch to loop entry if n even
+
+ ld r9, 0(tp) C odd n: handle one limb ahead of the two-limb loop
+ addi tp, tp, 8
+ and r9, r9, mask C table limb survives only on the selected entry
+ ld r11, 0(rp)
+ andc r11, r11, mask C old rp limb survives only on non-selected entries
+ or r9, r9, r11 C result = selected ? table limb : old rp limb
+ std r9, 0(rp)
+ addi rp, rp, 8
+ bdz L(end) C decrement ctr; skip the loop if no limb pairs remain (n = 1)
+
+ ALIGN(16)
+L(top): ld r9, 0(tp) C two limbs per iteration, same masked merge as above
+ ld r10, 8(tp)
+ addi tp, tp, 16
+ nop C NOTE(review): presumably scheduling/alignment padding -- confirm
+ and r9, r9, mask
+ and r10, r10, mask
+ ld r11, 0(rp)
+ ld r12, 8(rp)
+ andc r11, r11, mask
+ andc r12, r12, mask
+ or r9, r9, r11
+ or r10, r10, r12
+ std r9, 0(rp)
+ std r10, 8(rp)
+ addi rp, rp, 16
+ bdnz L(top) C decrement ctr; repeat while limb pairs remain
+
+L(end): subf rp, n, rp C move rp back to beginning; tp is left at the next entry
+ cmpdi cr6, nents, 1 C was this the last entry? cr6 is used so cr0 (n parity) stays intact
+ addi nents, nents, -1
+ bne cr6, L(outer) C every entry is visited; only mask depends on which
+
+ blr
+EPILOGUE()